1. 課題を見つける/仮説を立てる
2. 実験や観察をしてデータを集める
3. データを整理する
4. データを解析して仮説を検証する
- 仮説の検証にはエビデンスが必要
- でも生物の表現型には ばらつきがある
- 実験や観察で見られた差や効果は偶然の産物じゃないの?
- これに答えるのが統計解析
- 推論や予測などにも使うよ
※資料作成は岩嵜航さん(東北大学)にご協力いただきました。
(左右キーで進みます!)
Return to HOME
まず、自分が 何を示したいのかを整理する
自分のデータの性質を知り、適切な統計手法を探す
統計手法を実践する方法を探す
割合データは質的変数を変形したもの
モデリング
まず、自分が何を示したいのかを整理する
自分のデータの性質を知り、使えそうな統計手法を探す
統計手法を実践する方法を探す
※帰無仮説と対立仮説は排他的
\(Z\)統計量:\(\frac{\bar{X} - \mu}{\sqrt{\sigma^2/n}}\)…平均値の差を標準誤差\({\sqrt{\sigma^2/n}}\)で標準化
\(t\)統計量:\(\frac{\bar{X} - \mu}{\sqrt{\hat{\sigma}^2/n}}\)…母分散\(\sigma^2\)の代わりに不偏分散\(\hat{\sigma}^2\)を使う
他にも\(F\)統計量やカイ二乗統計量など
不適切な統計量(=方法)を用いると誤った\(p\)値が得られる
library(palmerpenguins) #penguinsを読み込み library(tidyverse) #tidyverseを読み込み head(penguins) #penguinsの最初を見せて!
## # A tibble: 6 × 8 ## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g ## <fct> <fct> <dbl> <dbl> <int> <int> ## 1 Adelie Torgersen 39.1 18.7 181 3750 ## 2 Adelie Torgersen 39.5 17.4 186 3800 ## 3 Adelie Torgersen 40.3 18 195 3250 ## 4 Adelie Torgersen NA NA NA NA ## 5 Adelie Torgersen 36.7 19.3 193 3450 ## 6 Adelie Torgersen 39.3 20.6 190 3650 ## # ℹ 2 more variables: sex <fct>, year <int>
penguins_adelie = penguins %>% filter(species == "Adelie") head(penguins_adelie)
## # A tibble: 6 × 8 ## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g ## <fct> <fct> <dbl> <dbl> <int> <int> ## 1 Adelie Torgersen 39.1 18.7 181 3750 ## 2 Adelie Torgersen 39.5 17.4 186 3800 ## 3 Adelie Torgersen 40.3 18 195 3250 ## 4 Adelie Torgersen NA NA NA NA ## 5 Adelie Torgersen 36.7 19.3 193 3450 ## 6 Adelie Torgersen 39.3 20.6 190 3650 ## # ℹ 2 more variables: sex <fct>, year <int>
gp = ggplot(data = penguins_adelie) + geom_point(aes(x = sex, y = body_mass_g), position = position_jitter(width = 0.05)) gp
## Warning: Removed 1 row containing missing values or values outside the scale range ## (`geom_point()`).
penguins_adelie = penguins %>% filter(species == "Adelie", !is.na(sex)) #NAを除く head(penguins_adelie)
## # A tibble: 6 × 8 ## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g ## <fct> <fct> <dbl> <dbl> <int> <int> ## 1 Adelie Torgersen 39.1 18.7 181 3750 ## 2 Adelie Torgersen 39.5 17.4 186 3800 ## 3 Adelie Torgersen 40.3 18 195 3250 ## 4 Adelie Torgersen 36.7 19.3 193 3450 ## 5 Adelie Torgersen 39.3 20.6 190 3650 ## 6 Adelie Torgersen 38.9 17.8 181 3625 ## # ℹ 2 more variables: sex <fct>, year <int>
gp = ggplot(data = penguins_adelie) + geom_point(aes(x = sex, y = body_mass_g), position = position_jitter(width = 0.05)) gp
gp = ggplot(data = penguins_adelie) + stat_qq(aes(sample = body_mass_g, color = sex)) + facet_wrap( ~ sex, scales = "free") gp
sw_test_results = penguins_adelie %>% #penguins_adelieを使う group_by(sex) %>% #sexでグループ summarise(statistic = shapiro.test(body_mass_g)$statistic, #body_mass_gにshapiro.testを行い、そのstatistic値をstatisticという列に p.value = shapiro.test(body_mass_g)$p.value, #同様 .groups = "drop") #グループを解除 head(sw_test_results)
## # A tibble: 2 × 3 ## sex statistic p.value ## <fct> <dbl> <dbl> ## 1 female 0.977 0.199 ## 2 male 0.983 0.416
var.test(data = penguins_adelie, body_mass_g ~ sex)
## ## F test to compare two variances ## ## data: body_mass_g by sex ## F = 0.60331, num df = 72, denom df = 72, p-value = 0.03356 ## alternative hypothesis: true ratio of variances is not equal to 1 ## 95 percent confidence interval: ## 0.3787186 0.9611059 ## sample estimates: ## ratio of variances ## 0.6033147
t.test(data = penguins_adelie, body_mass_g ~ sex)
## ## Welch Two Sample t-test ## ## data: body_mass_g by sex ## t = -13.126, df = 135.69, p-value < 2.2e-16 ## alternative hypothesis: true difference in means between group female and group male is not equal to 0 ## 95 percent confidence interval: ## -776.3012 -573.0139 ## sample estimates: ## mean in group female mean in group male ## 3368.836 4043.493
gp = ggplot(data = penguins_adelie) + geom_point(aes(x = sex, y = body_mass_g), size = 0.8, position = position_jitter(width = 0.07), alpha = 0.5) + annotate("text", x = 1.5, y = 5200, label = "***", size = 5, color = "gray30") + annotate("segment", x = 1.2, xend = 1.8, y = 5000, yend = 5000, color = "gray30") + coord_cartesian(ylim = c(0, 6000)) gp
ようやくグラフが完成!
#install.packages("easystats") library(easystats) report(t.test(data = penguins_adelie, body_mass_g ~ sex))
## Effect sizes were labelled following Cohen's (1988) recommendations. ## ## The Welch Two Sample t-test testing the difference of body_mass_g by sex (mean ## in group female = 3368.84, mean in group male = 4043.49) suggests that the ## effect is negative, statistically significant, and large (difference = -674.66, ## 95% CI [-776.30, -573.01], t(135.69) = -13.13, p < .001; Cohen's d = -2.25, 95% ## CI [-2.68, -1.82])
report(t.test(data = penguins_adelie, body_mass_g ~ sex)) %>% as.data.frame %>% #データフレームの形にして write_csv(file = "result_ttest.csv") %>% #csvに書き出し print() #表示
## Warning: Unable to retrieve data from htest object. ## Returning an approximate effect size using t_to_d().
## Welch Two Sample t-test ## ## Parameter | Group | Mean_Group1 | Mean_Group2 | Difference | 95% CI | t(135.69) | p | d | d CI ## ------------------------------------------------------------------------------------------------------------------------------- ## body_mass_g | sex | 3368.84 | 4043.49 | -674.66 | [-776.30, -573.01] | -13.13 | < .001 | -2.25 | [-2.68, -1.82] ## ## Alternative hypothesis: two.sided
penguins_adelie = penguins %>% filter(species == "Adelie", !is.na(sex)) %>% #NAを除く mutate(ratio = bill_length_mm/bill_depth_mm) head(penguins_adelie)
## # A tibble: 6 × 9 ## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g ## <fct> <fct> <dbl> <dbl> <int> <int> ## 1 Adelie Torgersen 39.1 18.7 181 3750 ## 2 Adelie Torgersen 39.5 17.4 186 3800 ## 3 Adelie Torgersen 40.3 18 195 3250 ## 4 Adelie Torgersen 36.7 19.3 193 3450 ## 5 Adelie Torgersen 39.3 20.6 190 3650 ## 6 Adelie Torgersen 38.9 17.8 181 3625 ## # ℹ 3 more variables: sex <fct>, year <int>, ratio <dbl>
gp = ggplot(data = penguins_adelie) + geom_point(aes(x = sex, y = ratio), position = position_jitter(width = 0.05)) gp
gp = ggplot(data = penguins_adelie) + stat_qq(aes(sample = ratio, color = sex)) + facet_wrap( ~ sex, scales = "free") gp
sw_test_results = penguins_adelie %>% #penguins_adelieを使う group_by(sex) %>% #sexでグループ summarise(statistic = shapiro.test(ratio)$statistic, p.value = shapiro.test(ratio)$p.value, .groups = "drop") head(sw_test_results)
## # A tibble: 2 × 3 ## sex statistic p.value ## <fct> <dbl> <dbl> ## 1 female 0.982 0.397 ## 2 male 0.984 0.480
var.test(data = penguins_adelie, ratio ~ sex)
## ## F test to compare two variances ## ## data: ratio by sex ## F = 0.78018, num df = 72, denom df = 72, p-value = 0.2946 ## alternative hypothesis: true ratio of variances is not equal to 1 ## 95 percent confidence interval: ## 0.4897412 1.2428573 ## sample estimates: ## ratio of variances ## 0.7801785
まずインストールする(1行目の#を消して走らせる)
#install.packages("coin") library(coin) # 読み込み wilcox_test(data = penguins_adelie, ratio ~ sex, distribution = "exact")
## ## Exact Wilcoxon-Mann-Whitney Test ## ## data: ratio by sex (female, male) ## Z = -0.29941, p-value = 0.7661 ## alternative hypothesis: true mu is not equal to 0
まずインストールする(1行目の#を消して走らせる)
#install.packages("brunnermunzel") library(brunnermunzel) # 読み込み brunnermunzel.test(data = penguins_adelie, ratio ~ sex)
## ## Brunner-Munzel Test ## ## data: ratio by sex ## Brunner-Munzel Test Statistic = 0.29701, df = 143.58, p-value = 0.7669 ## 95 percent confidence interval: ## 0.4188189 0.6098919 ## sample estimates: ## P(X<Y)+.5*P(X=Y) ## 0.5143554
library(palmerpenguins) #penguinsを読み込み library(tidyverse) #tidyverseを読み込み head(penguins) #penguinsの最初を見せて!
## # A tibble: 6 × 8 ## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g ## <fct> <fct> <dbl> <dbl> <int> <int> ## 1 Adelie Torgersen 39.1 18.7 181 3750 ## 2 Adelie Torgersen 39.5 17.4 186 3800 ## 3 Adelie Torgersen 40.3 18 195 3250 ## 4 Adelie Torgersen NA NA NA NA ## 5 Adelie Torgersen 36.7 19.3 193 3450 ## 6 Adelie Torgersen 39.3 20.6 190 3650 ## # ℹ 2 more variables: sex <fct>, year <int>
penguins_female = penguins %>% filter(sex == "female") head(penguins_female)
## # A tibble: 6 × 8 ## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g ## <fct> <fct> <dbl> <dbl> <int> <int> ## 1 Adelie Torgersen 39.5 17.4 186 3800 ## 2 Adelie Torgersen 40.3 18 195 3250 ## 3 Adelie Torgersen 36.7 19.3 193 3450 ## 4 Adelie Torgersen 38.9 17.8 181 3625 ## 5 Adelie Torgersen 41.1 17.6 182 3200 ## 6 Adelie Torgersen 36.6 17.8 185 3700 ## # ℹ 2 more variables: sex <fct>, year <int>
gp = ggplot(data = penguins_female) + geom_point(aes(x = species, y = body_mass_g), position = position_jitter(width = 0.05)) gp
gp = ggplot(data = penguins_female) + stat_qq(aes(sample = body_mass_g, color = species)) + facet_wrap( ~ species, scales = "free") gp
sw_test_results = penguins_female %>% #penguins_adelieを使う group_by(species) %>% #speciesでグループ summarise(statistic = shapiro.test(body_mass_g)$statistic, #body_mass_gにshapiro.testを行い、そのstatistic値をstatisticという列に p.value = shapiro.test(body_mass_g)$p.value, #同様 .groups = "drop") #グループを解除 head(sw_test_results)
## # A tibble: 3 × 3 ## species statistic p.value ## <fct> <dbl> <dbl> ## 1 Adelie 0.977 0.199 ## 2 Chinstrap 0.963 0.306 ## 3 Gentoo 0.981 0.511
bartlett.test(data = penguins_female, body_mass_g ~ species)
## ## Bartlett test of homogeneity of variances ## ## data: body_mass_g by species ## Bartlett's K-squared = 0.19828, df = 2, p-value = 0.9056
summary(aov(data = penguins_female, body_mass_g ~ species))
## Df Sum Sq Mean Sq F value Pr(>F) ## species 2 60350016 30175008 393.2 <2e-16 *** ## Residuals 162 12430757 76733 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(aov(data = penguins_female, body_mass_g ~ species))
## Tukey multiple comparisons of means ## 95% family-wise confidence level ## ## Fit: aov(formula = body_mass_g ~ species, data = penguins_female) ## ## $species ## diff lwr upr p adj ## Chinstrap-Adelie 158.3703 22.32078 294.4197 0.0179471 ## Gentoo-Adelie 1310.9058 1195.64908 1426.1624 0.0000000 ## Gentoo-Chinstrap 1152.5355 1011.00620 1294.0648 0.0000000
gp = ggplot(data = penguins) + geom_point(aes(x = species, y = body_mass_g), size = 0.5, position = position_jitter(width = 0.05), alpha = 0.3) + annotate("text", x = c(1, 2, 3), y = 7000, label = c("a", "b", "c")) + coord_cartesian(ylim = c(0, 8000)) gp
## Warning: Removed 2 rows containing missing values or values outside the scale range ## (`geom_point()`).
できあがり!
#install.packages("ARTool") #インストールする(#を外す) library(ARTool) #おまじない model = art(data = penguins_female, body_mass_g ~ species) summary(model)
## Aligned Rank Transform of Factorial Model ## ## Call: ## art(formula = body_mass_g ~ species, data = penguins_female) ## ## Column sums of aligned responses (should all be ~0): ## species ## 0 ## ## F values of ANOVAs on aligned responses not of interest (should all be ~0): ## Min. 1st Qu. Median Mean 3rd Qu. Max. ##
anova(model)
## Analysis of Variance of Aligned Rank Transformed Data ## ## Table Type: Anova Table (Type III tests) ## Model: No Repeated Measures (lm) ## Response: art(body_mass_g) ## ## Df Df.res F value Pr(>F) ## 1 species 2 162 191.44 < 2.22e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
art.con(model, "species")
## contrast estimate SE df t.ratio p.value ## Adelie - Chinstrap -17.6 5.44 162 -3.240 0.0041 ## Adelie - Gentoo -88.0 4.61 162 -19.104 <.0001 ## Chinstrap - Gentoo -70.4 5.66 162 -12.443 <.0001 ## ## P value adjustment: tukey method for comparing a family of 3 estimates
penguins_sexratio_sum = penguins %>% group_by(species, sex) %>% # speciesとsexごとにグループ分け summarise(length = length(species), # N数を数える .groups = "drop") %>% # groupを解除 filter(!is.na(sex)) %>% # sex == NAを除去 print() # 表示
## # A tibble: 6 × 3 ## species sex length ## <fct> <fct> <int> ## 1 Adelie female 73 ## 2 Adelie male 73 ## 3 Chinstrap female 34 ## 4 Chinstrap male 34 ## 5 Gentoo female 58 ## 6 Gentoo male 61
gp = ggplot(data = penguins_sexratio_sum, aes(x = species, y = length)) + geom_bar(aes(fill = sex), stat = "identity", position = "fill") + scale_y_continuous(labels = scales::percent) + coord_flip() # X軸とY軸を逆にする gp
penguins_sexratio = penguins %>% filter(!is.na(sex)) # NAを除く chisq.test(penguins_sexratio$sex, penguins_sexratio$species)
## ## Pearson's Chi-squared test ## ## data: penguins_sexratio$sex and penguins_sexratio$species ## X-squared = 0.048607, df = 2, p-value = 0.976
penguins_sexratio_sum_cross = penguins_sexratio_sum %>% pivot_wider(names_from = sex, values_from = length)%>% #横長に tibble::column_to_rownames(var = "species") %>% #speciesを行名に as.matrix() #行列に変換 penguins_sexratio_sum_cross
## female male ## Adelie 73 73 ## Chinstrap 34 34 ## Gentoo 58 61
chisq.test(penguins_sexratio_sum_cross)
## ## Pearson's Chi-squared test ## ## data: penguins_sexratio_sum_cross ## X-squared = 0.048607, df = 2, p-value = 0.976
まず、自分が何を示したいのかを整理する
自分のデータの性質を知り、適切な統計手法を探す
統計手法を実践する方法を探す
Coded by Heavy Watal
library(palmerpenguins) #penguinsを読み込み library(tidyverse) #tidyverseを読み込み head(penguins) #penguinsの最初を見せて!
## # A tibble: 6 × 8 ## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g ## <fct> <fct> <dbl> <dbl> <int> <int> ## 1 Adelie Torgersen 39.1 18.7 181 3750 ## 2 Adelie Torgersen 39.5 17.4 186 3800 ## 3 Adelie Torgersen 40.3 18 195 3250 ## 4 Adelie Torgersen NA NA NA NA ## 5 Adelie Torgersen 36.7 19.3 193 3450 ## 6 Adelie Torgersen 39.3 20.6 190 3650 ## # ℹ 2 more variables: sex <fct>, year <int>
gp = ggplot(data = penguins_adelie) + geom_point(aes(x = body_mass_g, y = flipper_length_mm)) gp
glm_result = glm(flipper_length_mm ~ body_mass_g, #線形予測子 family = gaussian(link = "identity"), #誤差分布とリンク関数 data = penguins_adelie) #データ summary(glm_result) #結果を格納したglm_resultの詳細を表示する
## ## Call: ## glm(formula = flipper_length_mm ~ body_mass_g, family = gaussian(link = "identity"), ## data = penguins_adelie) ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 1.656e+02 3.918e+00 42.27 < 2e-16 *** ## body_mass_g 6.610e-03 1.049e-03 6.30 3.4e-09 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for gaussian family taken to be 33.57457) ## ## Null deviance: 6167.5 on 145 degrees of freedom ## Residual deviance: 4834.7 on 144 degrees of freedom ## AIC: 931.33 ## ## Number of Fisher Scoring iterations: 2
report(glm_result) %>% as.data.frame() %>% #データフレーム型にする write_csv(file = "result_glm.csv") %>% # CSVで出力 print() #ここでも表示
## Parameter | Coefficient | 95% CI | t(144) | p | Std. Coef. | Std. Coef. 95% CI | Fit ## -------------------------------------------------------------------------------------------------------- ## (Intercept) | 165.60 | [157.92, 173.28] | 42.27 | < .001 | -1.93e-15 | [-0.14, 0.14] | ## body mass g | 6.61e-03 | [ 0.00, 0.01] | 6.30 | < .001 | 0.46 | [ 0.32, 0.61] | ## | | | | | | | ## AIC | | | | | | | 931.33 ## AICc | | | | | | | 931.50 ## BIC | | | | | | | 940.28 ## R2 | | | | | | | 0.22 ## Sigma | | | | | | | 5.79
#install.packages("ggeffects") library(ggeffects) #予測値をglm_predictという変数に格納 glm_predict = ggpredict(glm_result, terms = "body_mass_g") #termsにはX軸の列名を入れる glm_predict %>% as_tibble() #中身はこんなデータフレーム
## # A tibble: 21 × 6 ## x predicted std.error conf.low conf.high group ## <int> <dbl> <dbl> <dbl> <dbl> <fct> ## 1 2800 184. 1.06 182. 186. 1 ## 2 2900 185. 0.972 183. 187. 1 ## 3 3000 185. 0.883 184. 187. 1 ## 4 3100 186. 0.797 185. 188. 1 ## 5 3200 187. 0.716 185. 188. 1 ## 6 3300 187. 0.642 186. 189. 1 ## 7 3400 188. 0.577 187. 189. 1 ## 8 3500 189. 0.526 188. 190. 1 ## 9 3600 189. 0.492 188. 190. 1 ## 10 3700 190. 0.480 189. 191. 1 ## # ℹ 11 more rows
plot(glm_predict, show_data = T)
gp = ggplot() + geom_point(data = penguins_adelie, aes(x = body_mass_g, y = flipper_length_mm), alpha = 0.5) + geom_line(data = glm_predict, aes(x = x, y = predicted)) + geom_ribbon(data = glm_predict, aes(x = x, ymin = conf.low, ymax = conf.high), alpha = 0.2) + annotate("text", x = 4200, y = 210, label = "(Y ~ 0.006677X + 165.2, p = 3.4e-09)", size = 4) gp
Coded by Heavy Watal
shapiro.test(glm_result$residuals) #残差の正規性をShapiro検定
## ## Shapiro-Wilk normality test ## ## data: glm_result$residuals ## W = 0.99179, p-value = 0.562
Coded by Heavy Watal
Coded by Heavy Watal
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0. ## ℹ Please use `linewidth` instead. ## This warning is displayed once every 8 hours. ## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was ## generated.
\(Prob(X = k \mid \lambda) = \frac {\lambda^k e^{-\lambda}} {k!}\)
Coded by Heavy Watal
seed = read_csv("https://kuboweb.github.io/-kubo/stat/iwanamibook/fig/poisson/data3a.csv") %>% rename(no.seed = y, plant_size = x, treatment = f) head(seed)
## # A tibble: 6 × 3 ## no.seed plant_size treatment ## <dbl> <dbl> <chr> ## 1 6 8.31 C ## 2 6 9.44 C ## 3 6 9.5 C ## 4 12 9.07 C ## 5 10 10.2 C ## 6 4 8.32 C
gp = ggplot(data = seed) + geom_point(aes(x =plant_size , y = no.seed)) gp
glm_result = glm(no.seed ~ plant_size, family = poisson(link = "log"), #ポアソン回帰の設定 data = seed) summary(glm_result)
## ## Call: ## glm(formula = no.seed ~ plant_size, family = poisson(link = "log"), ## data = seed) ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 1.29172 0.36369 3.552 0.000383 *** ## plant_size 0.07566 0.03560 2.125 0.033580 * ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for poisson family taken to be 1) ## ## Null deviance: 89.507 on 99 degrees of freedom ## Residual deviance: 84.993 on 98 degrees of freedom ## AIC: 474.77 ## ## Number of Fisher Scoring iterations: 4
\(\beta_0\): 1.29172 (Pr(>|t|)=0.000383)
\(\beta_1\): 0.07566 (Pr(>|t|)=0.033580)
種子数への植物サイズの効果は有意である
\(Y = e^{(1.29172 + 0.07566 X)}\)に回帰できる
glm_predict = ggpredict(glm_result, terms = "plant_size") plot(glm_predict, show_data = T)
## Data points may overlap. Use the `jitter` argument to add some amount of ## random variation to the location of data points and avoid overplotting.
gp = ggplot() + geom_point(data = seed, aes(x = plant_size , y = no.seed)) + geom_line(data = glm_predict, aes(x = x , y = predicted)) + geom_ribbon(data = glm_predict, aes(x = x , ymin = conf.low, ymax = conf.high), alpha = 0.2) gp
Coded by Heavy Watal
\({Prob}(X = k \mid n,~p) = \binom n k p^k (1 - p)^{n - k}\)
Coded by Heavy Watal
seed2= read_csv("https://kuboweb.github.io/-kubo/stat/iwanamibook/fig/binomial/data4a.csv") %>% rename(observed_seed = N, survived_seed = y, plant_size = x, treatment = f) head(seed2)
## # A tibble: 6 × 4 ## observed_seed survived_seed plant_size treatment ## <dbl> <dbl> <dbl> <chr> ## 1 8 1 9.76 C ## 2 8 6 10.5 C ## 3 8 5 10.8 C ## 4 8 6 10.9 C ## 5 8 1 9.37 C ## 6 8 1 8.81 C
gp = ggplot(data = seed2) + geom_point(aes(x = plant_size , y = survived_seed, color = treatment)) gp
seed3 = seed2 %>% mutate(dead_seed = observed_seed-survived_seed) head(seed3)
## # A tibble: 6 × 5 ## observed_seed survived_seed plant_size treatment dead_seed ## <dbl> <dbl> <dbl> <chr> <dbl> ## 1 8 1 9.76 C 7 ## 2 8 6 10.5 C 2 ## 3 8 5 10.8 C 3 ## 4 8 6 10.9 C 2 ## 5 8 1 9.37 C 7 ## 6 8 1 8.81 C 7
glm_result = glm(cbind(survived_seed, dead_seed) ~ plant_size, family = binomial(link = "logit"), #ロジスティック回帰の設定 data = seed3) summary(glm_result)
## ## Call: ## glm(formula = cbind(survived_seed, dead_seed) ~ plant_size, family = binomial(link = "logit"), ## data = seed3) ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -13.7785 1.0773 -12.79 <2e-16 *** ## plant_size 1.4626 0.1107 13.21 <2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 499.23 on 99 degrees of freedom ## Residual deviance: 217.17 on 98 degrees of freedom ## AIC: 364.35 ## ## Number of Fisher Scoring iterations: 5
glm_result = glm(cbind(survived_seed, dead_seed) ~ plant_size + treatment, #重回帰 family = binomial(link = "logit"), #ロジスティック回帰の設定 data = seed3) summary(glm_result)
## ## Call: ## glm(formula = cbind(survived_seed, dead_seed) ~ plant_size + ## treatment, family = binomial(link = "logit"), data = seed3) ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) -19.5361 1.4138 -13.82 <2e-16 *** ## plant_size 1.9524 0.1389 14.06 <2e-16 *** ## treatmentT 2.0215 0.2313 8.74 <2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 499.23 on 99 degrees of freedom ## Residual deviance: 123.03 on 97 degrees of freedom ## AIC: 272.21 ## ## Number of Fisher Scoring iterations: 5
glm_predict = ggpredict(glm_result, terms = c("plant_size", "treatment")) glm_predict %>% as_tibble()
## # A tibble: 22 × 6 ## x predicted std.error conf.low conf.high group ## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> ## 1 7.5 0.00744 0.389 0.00349 0.0158 C ## 2 7.5 0.0536 0.276 0.0319 0.0887 T ## 3 8 0.0195 0.325 0.0104 0.0362 C ## 4 8 0.131 0.220 0.0890 0.188 T ## 5 8.5 0.0502 0.263 0.0306 0.0812 C ## 6 8.5 0.285 0.172 0.222 0.358 T ## 7 9 0.123 0.206 0.0857 0.173 C ## 8 9 0.514 0.144 0.444 0.584 T ## 9 9.5 0.271 0.159 0.214 0.337 C ## 10 9.5 0.738 0.146 0.679 0.789 T ## # ℹ 12 more rows
plot(glm_predict, show_data = T)
## Raw data not available.
gp = ggplot(data = seed2) + geom_point(aes(x = plant_size , y = survived_seed/observed_seed, color = treatment)) + geom_line(data = glm_predict, aes(x = x , y = predicted, color = group)) + geom_ribbon(data = glm_predict, aes(x = x , ymin = conf.low, ymax = conf.high, fill = group), alpha = 0.2) gp
#install.packages("MASS") #まずインストール library(MASS) #まず読み込み head(birthwt, n = 2) #MASSに入っているbirthwtというサンプルデータを使う
## low age lwt race smoke ptl ht ui ftv bwt ## 85 0 19 182 2 0 0 0 1 0 2523 ## 86 0 33 155 3 0 0 0 0 3 2551
gp = ggplot(data = birthwt, aes(x = lwt , y = low)) + geom_point() gp
glm_result = glm(low ~ lwt, family = binomial(link = "logit"), #ロジスティック回帰の設定 data = birthwt) summary(glm_result)
## ## Call: ## glm(formula = low ~ lwt, family = binomial(link = "logit"), data = birthwt) ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) ## (Intercept) 0.99831 0.78529 1.271 0.2036 ## lwt -0.01406 0.00617 -2.279 0.0227 * ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## (Dispersion parameter for binomial family taken to be 1) ## ## Null deviance: 234.67 on 188 degrees of freedom ## Residual deviance: 228.69 on 187 degrees of freedom ## AIC: 232.69 ## ## Number of Fisher Scoring iterations: 4
glm_predict = ggpredict(glm_result, terms = "lwt[all]") plot(glm_predict, show_data = T)
## Data points may overlap. Use the `jitter` argument to add some amount of ## random variation to the location of data points and avoid overplotting.
gp = ggplot() + geom_point(data = birthwt, aes(x = lwt , y = low)) + geom_line(data = glm_predict, aes(x = x , y = predicted)) + geom_ribbon(data = glm_predict, aes(x = x , ymin = conf.low, ymax = conf.high), alpha = 0.2) gp
正規分布(gaussian)…確率変数の和、平均値
二項分布(binomial)…成功率p、試行回数nのうちの成功回数
ポアソン分布(poisson)…単位時間あたり平均λ回起こる事象の発生回数
ガンマ分布(Gamma)…ポアソン過程でk回起こるまでの待ち時間
他にinverse.gaussian, quasi, quasibinomial, quasipoisson.
それぞれの分布やリンク関数の形を調べて選ぼう