![](https://mmbiz.qpic.cn/mmbiz_png/ibOFjxwickib46pKW3iaXoQmvy1cKCjJdK5RbOGJxWuqJX7iaZUcRcbSZgall5SgbgAuLPBqFTLCqUGcNks6Enc23uQ/640?wx_fmt=png)
pingouin是基于Pandas和numpy开发的Python3统计包。主要统计功能包括T检验、相关分析(含鲁棒相关)、正态性检验、方差分析、多元线性回归和中介效应分析等,下文逐一演示。
安装
pip3 install pingouin
快速上手
构造实验数据x,y
import numpy as np
#控制代码每次随机状态保持一致
np.random.seed(666)
n=30
mean= [4,5]
cov = [(1, 0.6), (0.6, 1)]
x, y = np.random.multivariate_normal(mean, cov, n).T
x
array([3.04817645, 2.54387965, 4.56033188, 4.40504338, 3.77876203,
3.87177128, 3.4546112 , 4.47317551, 5.23133856, 5.40273745,
5.19344217, 3.37061786, 3.23980982, 2.85574177, 4.67728276,
4.31935242, 4.39440207, 3.87458876, 4.91426293, 3.13673286,
3.73459839, 4.18708647, 5.48558345, 3.7066784 , 3.73400287,
3.49664637, 3.95954844, 2.61545452, 5.11352964, 5.62666503])
y
array([4.47747109, 4.35695696, 5.46239455, 4.56091782, 4.07534588,
4.03904897, 3.79549165, 5.06121364, 5.71635355, 6.60772697,
6.94890455, 5.13347618, 5.41207983, 3.38254684, 5.49705058,
5.93394729, 4.65224366, 4.59491971, 5.17926604, 4.25844527,
5.72809738, 5.14997732, 5.27606588, 4.94570454, 6.02889647,
5.85451666, 4.90231286, 4.69242625, 4.69367432, 6.71644528])
import matplotlib.pyplot as plt
plt.hist(x, bins=10)
![](https://mmbiz.qpic.cn/mmbiz_png/ibOFjxwickib46pKW3iaXoQmvy1cKCjJdK5RNFop4KSC5uxP6xZWn1l6WGIFbeKvcv1dMCpJZk3hyiapuh2zUmUZYKA/640?wx_fmt=png)
plt.hist(y, bins=10)
![](https://mmbiz.qpic.cn/mmbiz_png/ibOFjxwickib46pKW3iaXoQmvy1cKCjJdK5RkAHZAx2OwWiazozUTfCo90eYTibDicc7n2L9icIcWic9TuWWdTgTsOwibqyA/640?wx_fmt=png)
1. T检验
import pingouin as pg
pg.ttest(x, y)
| | T | dof | tail | p-val | CI95% | cohen-d | BF10 | power |
|---|---|---|---|---|---|---|---|---|
| T-test | -4.597628 | 58 | two-sided | 0.000024 | [-1.47, -0.58] | 1.187102 | 786.346 | 0.994771 |
2. 皮尔森相关
pg.corr(x, y)
| | n | r | CI95% | r2 | adj_r2 | p-val | BF10 | power |
|---|---|---|---|---|---|---|---|---|
| pearson | 30 | 0.60149 | [0.31, 0.79] | 0.36179 | 0.314515 | 0.000439 | 82.116 | 0.955747 |
3. 鲁棒相关
#添加一个异常值
x[5] = 18
#使用Shepherd's pi correlation
pg.corr(x, y, method="shepherd")
| | n | outliers | r | CI95% | r2 | adj_r2 | p-val | power |
|---|---|---|---|---|---|---|---|---|
| shepherd | 30 | 1 | 0.569458 | [0.26, 0.77] | 0.324283 | 0.274229 | 0.001263 | 0.926066 |
4. 数据正态性检验
pg.normality(x)
| | W | pval | normal |
|---|---|---|---|
| 0 | 0.970533 | 0.553863 | True |
pg.normality(y)
| | W | pval | normal |
|---|---|---|---|
| 0 | 0.985161 | 0.939893 | True |
pg.multivariate_normality(np.column_stack((x, y)))
(True, 0.6257634649268228)
5. Q-Q plot
import numpy as np
import pingouin as pg
np.random.seed(666)
x = np.random.normal(size=50)
ax = pg.qqplot(x, dist='norm')
![](https://mmbiz.qpic.cn/mmbiz_png/ibOFjxwickib46pKW3iaXoQmvy1cKCjJdK5R0MXU9IX6d9s9l6KzlF85bP6BsjXQs5pic7tQ3fQ4S2J0ByuEVZUeQicg/640?wx_fmt=png)
6. 单因素方差分析
# 读取数据
df = pg.read_dataset('mixed_anova')
df.sample(10)
| | Scores | Time | Group | Subject |
|---|---|---|---|---|
| 142 | 6.502562 | January | Meditation | 52 |
| 55 | 5.355380 | January | Control | 25 |
| 70 | 4.714565 | June | Control | 10 |
| 167 | 6.586494 | June | Meditation | 47 |
| 169 | 7.388138 | June | Meditation | 49 |
| 107 | 5.031982 | August | Meditation | 47 |
| 135 | 4.837971 | January | Meditation | 45 |
| 163 | 5.483801 | June | Meditation | 43 |
| 37 | 5.177205 | January | Control | 7 |
| 4 | 4.779411 | August | Control | 4 |
# Run the ANOVA
aov = pg.anova(data=df,
dv='Scores', #因变量
between='Group',
detailed=True)
aov
| | Source | SS | DF | MS | F | p-unc | np2 |
|---|---|---|---|---|---|---|---|
| 0 | Group | 5.459963 | 1 | 5.459963 | 5.243656 | 0.0232 | 0.028616 |
| 1 | Within | 185.342729 | 178 | 1.041251 | NaN | NaN | NaN |
7. 重复测量方差分析
pg.rm_anova(data=df,
dv='Scores',
within='Time',
subject='Subject',
detailed=True)
| | Source | SS | DF | MS | F | p-unc | np2 | eps |
|---|---|---|---|---|---|---|---|---|
| 0 | Time | 7.628428 | 2 | 3.814214 | 3.912796 | 0.022629 | 0.062194 | 0.998751 |
| 1 | Error | 115.027023 | 118 | 0.974805 | NaN | NaN | NaN | NaN |
8. 有交互作用的双因素混合方差分析
# Compute the two-way mixed ANOVA and print the results table
aov = pg.mixed_anova(data=df,
dv='Scores',
between='Group',
within='Time',
subject='Subject',
correction=False,
effsize="np2")
pg.print_table(aov)
=============
ANOVA SUMMARY
=============
Source SS DF1 DF2 MS F p-unc np2 eps
----------- ----- ----- ----- ----- ----- ------- ----- -------
Group 5.460 1 58 5.460 5.052 0.028 0.080 nan
Time 7.628 2 116 3.814 4.027 0.020 0.065 0.999
Interaction 5.167 2 116 2.584 2.728 0.070 0.045 nan
9. 多元线性回归
(注:本节与下一节代码中的 data 是一个包含 X、Y、Z 三列的 pandas.DataFrame,本文未给出其构造代码,运行前需自行准备。)
pg.linear_regression(data[['X', 'Z']], data['Y'])
| | names | coef | se | T | pval | r2 | adj_r2 | CI[2.5%] | CI[97.5%] |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Intercept | 2.916901 | 1.444715 | 2.019015 | 0.053516 | 0.26855 | 0.214368 | -0.047409 | 5.881210 |
| 1 | X | 0.610580 | 0.202261 | 3.018775 | 0.005487 | 0.26855 | 0.214368 | 0.195575 | 1.025584 |
| 2 | Z | -0.007227 | 0.192089 | -0.037624 | 0.970264 | 0.26855 | 0.214368 | -0.401361 | 0.386907 |
10. 中介效应分析
pg.mediation_analysis(data=data, x='X', m='Z', y='Y', seed=42, n_boot=1000)
| | path | coef | se | pval | CI[2.5%] | CI[97.5%] | sig |
|---|---|---|---|---|---|---|---|
| 0 | Z ~ X | -0.287032 | 0.191454 | 0.145006 | -0.679207 | 0.105142 | No |
| 1 | Y ~ Z | -0.165299 | 0.209888 | 0.437572 | -0.595235 | 0.264637 | No |
| 2 | Total | 0.612654 | 0.191099 | 0.003354 | 0.221205 | 1.004103 | Yes |
| 3 | Direct | 0.610580 | 0.202261 | 0.005487 | 0.195575 | 1.025584 | Yes |
| 4 | Indirect | 0.002074 | 0.042262 | 0.976000 | -0.088619 | 0.092009 | No |
Pingouin与Pandas
pandas.DataFrame可直接使用Pingouin的很多统计方法,例如
import pingouin as pg
# Example 1 | ANOVA
df = pg.read_dataset('mixed_anova')
df.anova(dv='Scores', between='Group', detailed=True)
# Example 2 | Pairwise correlations
data = pg.read_dataset('mediation')
data.pairwise_corr(columns=['X', 'M', 'Y'], covar=['Mbin'])
# Example 3 | Partial correlation matrix
data.pcorr()
pandas.DataFrame支持的pingouin统计方法有:
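- pingouin.anova()
- pingouin.ancova()
- pingouin.rm_anova()
- pingouin.mixed_anova()
- pingouin.welch_anova()
- pingouin.pairwise_ttests()
- pingouin.pairwise_tukey()
- pingouin.pairwise_corr()
- pingouin.partial_corr()
- pingouin.pcorr()
- pingouin.rcorr()
- pingouin.mediation_analysis()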