import numpy as np
np.set_printoptions(legacy='1.21')
import pandas as pd
from scipy import stats
import seaborn as sns
sns.set_theme()
import statsmodels.formula.api as smf

df = pd.read_csv('galton.csv')

formel = "childHeight ~ father + mother"
model = smf.ols(formel, df)

res = model.fit()

res.summary()

formel = "childHeight ~ father + mother + midparentHeight"

modell = smf.ols(formel, df)
res = modell.fit()

res.summary()

df['m1'] = df.mother - 0.0001

f1 = "childHeight ~ m1 + father + midparentHeight"
m1 = smf.ols(f1, df)
r1 = m1.fit()

r1.summary()

df = pd.read_csv('kinder.csv')
df.head()

sns.lmplot(df, x='father', y='childHeight', hue='gender');

formel = 'childHeight ~ father + mother + gender'
modell = smf.ols(formel, df)

res = modell.fit()

res.summary()

anfrage = pd.DataFrame()
anfrage['father'] = [68.9]
anfrage['mother'] = [66.4]
anfrage['gender'] = ['male']
anfrage

res.get_prediction(anfrage).summary_frame()

df = pd.read_csv('ratten.csv')

sns.lmplot(df, x='Alter', y='Belastung', hue='Gelände');

formel = 'Belastung ~ Alter + Gelände'
modell = smf.ols(formel, df)
res = modell.fit()

res.summary()

anfrage = pd.DataFrame()
anfrage['Alter'] = [8,8,9,9]
anfrage['Gelände'] = ['kontaminiert', 'unkontaminiert', 'kontaminiert', 'unkontaminiert']
anfrage

res.get_prediction(anfrage).summary_frame()

res.get_prediction(anfrage).summary_frame(alpha=0.02)

Dep. Variable:	childHeight	R-squared:	0.238
Model:	OLS	Adj. R-squared:	0.235
Method:	Least Squares	F-statistic:	74.62
Date:	Wed, 18 Jun 2025	Prob (F-statistic):	6.25e-29
Time:	14:20:44	Log-Likelihood:	-1080.7
No. Observations:	481	AIC:	2167.
Df Residuals:	478	BIC:	2180.
Df Model:	2
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	19.3128	4.095	4.716	0.000	11.266	27.359
father	0.4176	0.046	9.154	0.000	0.328	0.507
mother	0.3288	0.045	7.258	0.000	0.240	0.418

Omnibus:	10.653	Durbin-Watson:	1.592
Prob(Omnibus):	0.005	Jarque-Bera (JB):	14.542
Skew:	-0.200	Prob(JB):	0.000695
Kurtosis:	3.752	Cond. No.	3.69e+03

Dep. Variable:	childHeight	R-squared:	0.238
Model:	OLS	Adj. R-squared:	0.235
Method:	Least Squares	F-statistic:	74.62
Date:	Wed, 18 Jun 2025	Prob (F-statistic):	6.25e-29
Time:	14:20:45	Log-Likelihood:	-1080.7
No. Observations:	481	AIC:	2167.
Df Residuals:	478	BIC:	2180.
Df Model:	2
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	19.3128	4.095	4.716	0.000	11.266	27.359
father	0.2923	0.040	7.361	0.000	0.214	0.370
mother	0.1935	0.038	5.055	0.000	0.118	0.269
midparentHeight	0.2506	0.021	12.091	0.000	0.210	0.291

Mathematik für Biologiestudierende II¶

Themen¶

Multikollinearität¶

Konditionszahl¶

Ein nicht statistisches Beispiel mit schlechter Konditionszahl¶

Beispiel mit wirklich schlechter Kondition¶

Lineare Modelle mit kategoriellen Daten¶

Prediction im kategoriellen Fall¶

Beispiel Ratten¶

Omnibus:	10.752	Durbin-Watson:	1.592
Prob(Omnibus):	0.005	Jarque-Bera (JB):	14.794
Skew:	-0.199	Prob(JB):	0.000613
Kurtosis:	3.762	Cond. No.	9.71e+14

	family	father	mother	midparentHeight	children	childNum	gender	childHeight
0	001	78.5	67.0	75.43	4	1	male	73.2
1	001	78.5	67.0	75.43	4	2	female	69.2
2	001	78.5	67.0	75.43	4	3	female	69.0
3	001	78.5	67.0	75.43	4	4	female	69.0
4	002	75.5	66.5	73.66	4	1	male	73.5

Omnibus:	11.156	Durbin-Watson:	1.549
Prob(Omnibus):	0.004	Jarque-Bera (JB):	15.397
Skew:	-0.114	Prob(JB):	0.000453
Kurtosis:	3.586	Cond. No.	3.63e+03

Dep. Variable:	Belastung	R-squared:	0.673
Model:	OLS	Adj. R-squared:	0.635
Method:	Least Squares	F-statistic:	17.52
Date:	Wed, 18 Jun 2025	Prob (F-statistic):	7.42e-05
Time:	14:20:46	Log-Likelihood:	-63.935
No. Observations:	20	AIC:	133.9
Df Residuals:	17	BIC:	136.9
Df Model:	2
Covariance Type:	nonrobust

Omnibus:	1.119	Durbin-Watson:	2.547
Prob(Omnibus):	0.572	Jarque-Bera (JB):	0.408
Skew:	-0.346	Prob(JB):	0.815
Kurtosis:	3.102	Cond. No.	33.2

	Alter	Gelände
0	8	kontaminiert
1	8	unkontaminiert
2	9	kontaminiert
3	9	unkontaminiert

	mean	mean_se	mean_ci_lower	mean_ci_upper	obs_ci_lower	obs_ci_upper
0	67.564695	2.037898	63.265107	71.864283	53.358146	81.771244
1	56.466728	2.284487	51.646882	61.286575	42.094168	70.839289
2	71.113678	2.182221	66.509596	75.717761	56.812030	85.415327
3	60.015712	2.074920	55.638014	64.393409	45.785328	74.246095

	mean	mean_se	mean_ci_lower	mean_ci_upper	obs_ci_lower	obs_ci_upper
0	67.564695	2.037898	62.333546	72.795844	50.280117	84.849274
1	56.466728	2.284487	50.602601	62.330856	38.980170	73.953287
2	71.113678	2.182221	65.512062	76.715294	53.713396	88.513961
3	60.015712	2.074920	54.689530	65.341893	42.702135	77.329289

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	1.36e+07	3.78e+07	0.359	0.719	-6.08e+07	8.8e+07
m1	1.36e+11	3.78e+11	0.359	0.719	-6.08e+11	8.8e+11
father	1.259e+11	3.5e+11	0.359	0.719	-5.63e+11	8.14e+11
midparentHeight	-2.519e+11	7.01e+11	-0.359	0.719	-1.63e+12	1.13e+12

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	16.5212	2.727	6.058	0.000	11.169	21.873
gender[T.male]	5.2150	0.142	36.775	0.000	4.937	5.493
father	0.3928	0.029	13.699	0.000	0.337	0.449
mother	0.3176	0.031	10.245	0.000	0.257	0.378

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	39.1728	5.166	7.583	0.000	28.273	50.072
Gelände[T.unkontaminiert]	-11.0980	3.124	-3.552	0.002	-17.689	-4.507
Alter	3.5490	0.617	5.752	0.000	2.247	4.851