# Third-party imports first (PEP 8), configuration afterwards.
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Select NumPy's 1.21-era print format so that scalars are displayed as
# plain numbers in the cell outputs.
np.set_printoptions(legacy='1.21')

# Seaborn default theme with the larger "talk" scaling of plot elements.
sns.set_theme()
sns.set_context('talk')

# Exact two-sided binomial test: 35 successes in 50 trials against the
# default null hypothesis p = 0.5.
stats.binomtest(35, 50)

BinomTestResult(k=35, n=50, alternative='two-sided', statistic=0.7, pvalue=0.006600447966810918)

# Number of successes in 100 independent trials, each succeeding with
# probability 0.0066.
P = stats.binom(n=100, p=0.0066)

# Probability of observing no success at all in those 100 trials.
p0 = P.pmf(0)
p0

0.5157218904013275

# Complement of p0: probability of at least one success in the 100 trials.
1 - p0

0.48427810959867246

# Scratch arithmetic, only displayed and not stored.
# NOTE(review): what 100 and 1.5 stand for is not stated in this file.
100/1.5

66.66666666666667

# Cube of 66, only displayed and not stored.
# NOTE(review): 66 is presumably the integer part of the 100/1.5 result
# above - confirm the intended meaning.
66 * 66 * 66

287496

# Remote CSV with the pollutant measurements.
u_schad = (
    "https://www.math.uni-duesseldorf.de/~braun/bio2324/data/schadstoffe.csv"
)

# The first CSV column becomes the row index of the frame.
schadstoff = pd.read_csv(u_schad, index_col=0)

# Show the first rows as a quick sanity check.
schadstoff.head()

# Distribution of the concentration, one panel per measuring site
# ('Messstelle'). The trailing semicolon suppresses the textual repr of the
# returned object in the notebook.
sns.displot(data=schadstoff, x='Konzentration', col='Messstelle');

# Concentrations measured at site 1.
g1 = schadstoff.loc[schadstoff.Messstelle == 1, 'Konzentration']
g1

2     0.000589
3     0.000950
13    0.001301
14    0.001605
18    0.000927
22    0.001250
28    0.000965
33    0.000669
41    0.000712
42    0.001019
45    0.000780
54    0.001306
61    0.001006
64    0.001057
65    0.000381
70    0.000919
74    0.001323
Name: Konzentration, dtype: float64

# Concentrations measured at sites 2 to 5, one Series per site.
g2, g3, g4, g5 = (
    schadstoff.loc[schadstoff.Messstelle == stelle, 'Konzentration']
    for stelle in (2, 3, 4, 5)
)

# Peek at the first values of site 4.
g4.head()

4     0.001152
5     0.001318
6     0.000849
8     0.000982
23    0.000505
Name: Konzentration, dtype: float64

# One-way ANOVA: tests whether the mean concentration is the same at all
# five measuring sites.
res = stats.f_oneway(g1, g2, g3, g4, g5)
res

F_onewayResult(statistic=0.8666121588849811, pvalue=0.48807057520065544)

# Reproduce the ANOVA p-value by hand from the F distribution with
# 4 (= 5 groups - 1) numerator and 75 denominator degrees of freedom.
# NOTE(review): 75 presumably equals total observations minus 5 groups;
# the total sample size is not visible here - confirm.
P = stats.f(4, 75)

# Upper-tail probability of the observed F statistic. The survival function
# equals 1 - cdf mathematically but avoids the rounding loss of the explicit
# subtraction when the tail probability is small.
P.sf(res.statistic)

0.48807057520065544

# p-value as reported by f_oneway, for comparison with the manual computation.
res.pvalue

0.48807057520065544

# Fetch seaborn's "penguins" example data set as a DataFrame.
pingus = sns.load_dataset(name="penguins")

# Show the first rows as a quick sanity check.
pingus.head()

# Distribution of the bill length, coloured by species with the species
# stacked on top of each other. The trailing semicolon suppresses the
# textual repr of the returned object in the notebook.
sns.displot(pingus, x='bill_length_mm', hue='species', multiple='stack');

# Number of rows per species (rows with missing measurements included).
pingus.species.value_counts()

species
Adelie       152
Gentoo       124
Chinstrap     68
Name: count, dtype: int64

# Bill lengths of the Adelie penguins; missing values are still included.
gA = pingus.loc[pingus.species == 'Adelie', 'bill_length_mm']
gA.head()

0    39.1
1    39.5
2    40.3
3     NaN
4    36.7
Name: bill_length_mm, dtype: float64

# Bill lengths of the Gentoo and Chinstrap penguins (missing values included).
gG, gC = (
    pingus.loc[pingus.species == art, 'bill_length_mm']
    for art in ('Gentoo', 'Chinstrap')
)

# ANOVA on the raw series: the result is nan because the data still
# contain missing values (e.g. row 3 of the Adelie bill lengths).
stats.f_oneway(gA, gG, gC)

F_onewayResult(statistic=nan, pvalue=nan)

# Keep only complete rows: dropna() discards a row as soon as any of its
# columns is missing, not just the bill length.
pingus_alle_daten = pingus.dropna()

# Bill lengths per species, now free of missing values.
gA, gG, gC = (
    pingus_alle_daten.loc[pingus_alle_daten.species == art, 'bill_length_mm']
    for art in ('Adelie', 'Gentoo', 'Chinstrap')
)

# One-way ANOVA: do the three species share the same mean bill length?
res = stats.f_oneway(gA, gG, gC)
res

F_onewayResult(statistic=397.2994374128277, pvalue=1.3809842053153047e-88)

# Pairwise two-sample t-test Adelie vs. Gentoo (SciPy's default assumes equal
# variances); the p-value is not corrected for multiple comparisons.
r1 = stats.ttest_ind(gA, gG)
r1

TtestResult(statistic=-24.66879239628207, pvalue=2.2112060856021175e-70, df=263.0)

# Pairwise two-sample t-test Adelie vs. Chinstrap (SciPy's default assumes
# equal variances); the p-value is not corrected for multiple comparisons.
r2 = stats.ttest_ind(gA, gC)
r2

TtestResult(statistic=-23.562058327794357, pvalue=3.988191872307172e-61, df=212.0)

# Pairwise two-sample t-test Gentoo vs. Chinstrap (SciPy's default assumes
# equal variances); the p-value is not corrected for multiple comparisons.
r3 = stats.ttest_ind(gG, gC)
r3

TtestResult(statistic=-2.608098387774673, pvalue=0.00984830289764215, df=185.0)

from statsmodels.sandbox.stats.multicomp import MultiComparison

# Bill lengths grouped by species. pingus_alle_daten is pingus.dropna(),
# i.e. the same complete-case data used for the ANOVA and the t-tests.
muc = MultiComparison(
    pingus_alle_daten.bill_length_mm,
    pingus_alle_daten.species,
)

# All pairwise t-tests with Bonferroni-corrected p-values at level 0.01.
# Element 0 of the returned tuple is the summary table.
res = muc.allpairtest(stats.ttest_ind, alpha=0.01, method='bonferroni')
res[0]

# The same comparisons with the Bonferroni-Holm (step-down) correction.
res = muc.allpairtest(stats.ttest_ind, alpha=0.01, method='holm')
res[0]

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female
3	Adelie	Torgersen	NaN	NaN	NaN	NaN	NaN
4	Adelie	Torgersen	36.7	19.3	193.0	3450.0	Female

Mathematik für Biologiestudierende

Wiederholung (interaktiv)

Themen heute

Data Snooping

Szenario 1

Szenario 2

Gefahren des Data Snooping

Fazit

Multiple Vergleiche

Bonferroni-Korrektur

Bonferroni-Holm-Korrektur

False Discovery Rate

Gruppenvergleiche

ANOVA

Beispiel Schadstoffkonzentration

Haben unterschiedliche Pinguinarten unterschiedliche Schnabellängen?

Automatische Paarvergleiche

Bonferroni-Holm

Bei den Pinguinen

group1	group2	stat	pval	pval_corr	reject
Adelie	Chinstrap	-23.5621	0.0	0.0	True
Adelie	Gentoo	-24.6688	0.0	0.0	True
Chinstrap	Gentoo	2.6081	0.0098	0.0295	False

	Messstelle	Konzentration
0	5	0.000867
1	3	0.000490
2	1	0.000589
3	1	0.000950
4	4	0.001152