#!/usr/bin/env python

# DO NOT EDIT
# Autogenerated from the notebook metaanalysis1.ipynb.
# Edit the notebook and then sync the output with this file.
#
# flake8: noqa
# DO NOT EDIT

# # Meta-Analysis in statsmodels
#
# Statsmodels includes basic methods for meta-analysis. This notebook
# illustrates the current usage.
#
# Status: The results have been verified against the R meta and metafor
# packages. However, the API is still experimental and will still change.
# Some options for additional methods that are available in the R meta
# and metafor packages are missing.
#
# The support for meta-analysis has three parts:
#
# - effect size functions: this currently includes
#   ``effectsize_smd``, which computes the effect size and its standard
# error for the standardized mean difference, and
#   ``effectsize_2proportions``, which computes effect sizes for
# comparing two independent proportions using the risk difference, (log)
# risk ratio, (log) odds-ratio or the arcsine square root transformation
# - ``combine_effects``, which computes the fixed and random effects
# estimates for the overall mean or effect. The returned results
# instance includes a forest plot function.
# - helper functions to estimate the random effects variance, tau-squared
#
# The estimate of the overall effect size in `combine_effects` can also
# be replicated using WLS or GLM with var_weights, as illustrated below.
#
# Finally, the meta-analysis functions currently do not include the
# Mantel-Haenszel method. However, the fixed effects results can be
# computed directly using `StratifiedTable`, as illustrated below.

import numpy as np
import pandas as pd
from scipy import stats, optimize

from statsmodels.regression.linear_model import WLS
from statsmodels.genmod.generalized_linear_model import GLM

from statsmodels.stats.meta_analysis import (
    effectsize_smd,
    effectsize_2proportions,
    combine_effects,
    _fit_tau_iterative,
    _fit_tau_mm,
    _fit_tau_iter_mm,
)

# increase the display width for pandas output
pd.set_option("display.width", 100)

# ## Example

data = [
    ["Carroll", 94, 22, 60, 92, 20, 60],
    ["Grant", 98, 21, 65, 92, 22, 65],
    ["Peck", 98, 28, 40, 88, 26, 40],
    ["Donat", 94, 19, 200, 82, 17, 200],
    ["Stewart", 98, 21, 50, 88, 22, 45],
    ["Young", 96, 21, 85, 92, 22, 85],
]
colnames = ["study", "mean_t", "sd_t", "n_t", "mean_c", "sd_c", "n_c"]
rownames = [i[0] for i in data]
dframe1 = pd.DataFrame(data, columns=colnames)
rownames

mean2, sd2, nobs2, mean1, sd1, nobs1 = np.asarray(
    dframe1[["mean_t", "sd_t", "n_t", "mean_c", "sd_c", "n_c"]]).T
rownames = dframe1["study"]
rownames.tolist()

np.array(nobs1 + nobs2)

# ### Estimate effect size: standardized mean difference

eff, var_eff = effectsize_smd(mean2, sd2, nobs2, mean1, sd1, nobs1)

# ### Using one-step chi2 (DerSimonian-Laird) estimate for the random
# effects variance tau
#
# The method option for the random effects variance is
# `method_re="chi2"` or `method_re="dl"`; both names are accepted.
# This is commonly referred to as the DerSimonian-Laird method. It is a
# method of moments estimator based on the Pearson chi2 statistic from
# the fixed effects estimate.
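
# As a rough by-hand illustration (a sketch, not necessarily matching
# the statsmodels implementation in every detail), the
# DerSimonian-Laird moment estimator can be computed from the fixed
# effects weights and the Pearson chi2 statistic:

w_fe = 1 / var_eff  # fixed effects (inverse variance) weights
mean_fe = np.average(eff, weights=w_fe)
q_fe = np.sum(w_fe * (eff - mean_fe) ** 2)  # Pearson chi2, Cochran's Q
k = len(eff)
# truncate at zero; a negative moment estimate means no heterogeneity
tau2_dl = np.maximum(0, (q_fe - (k - 1)) /
                     (w_fe.sum() - (w_fe**2).sum() / w_fe.sum()))
tau2_dl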

res3 = combine_effects(eff,
                       var_eff,
                       method_re="chi2",
                       use_t=True,
                       row_names=rownames)
# TODO: we still need better information about the conf_int of
# individual samples.
# We do not have enough information in the model for individual
# confidence intervals if those are not based on the normal
# distribution.
res3.conf_int_samples(nobs=np.array(nobs1 + nobs2))
print(res3.summary_frame())

res3.cache_ci

res3.method_re

fig = res3.plot_forest()
fig.set_figheight(6)
fig.set_figwidth(6)

# Repeat the analysis with normal distribution based inference,
# `use_t=False`.

res3 = combine_effects(eff,
                       var_eff,
                       method_re="chi2",
                       use_t=False,
                       row_names=rownames)
# TODO: we still need better information about the conf_int of
# individual samples.
# We do not have enough information in the model for individual
# confidence intervals if those are not based on the normal
# distribution.
res3.conf_int_samples(nobs=np.array(nobs1 + nobs2))
print(res3.summary_frame())

# ### Using the iterated, Paule-Mandel estimate for the random effects
# variance tau
#
# The method commonly referred to as the Paule-Mandel estimate is a
# method of moments estimator for the random effects variance that
# iterates between the mean and variance estimates until convergence.
#
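
# A minimal by-hand sketch of the Paule-Mandel condition (illustrative
# only; the actual implementation lives in
# ``statsmodels.stats.meta_analysis``): find tau2 such that the
# weighted residual sum of squares equals its degrees of freedom, k - 1.


def _pm_condition(tau2, eff, var_eff):
    # random effects weights at a candidate tau2
    w = 1 / (var_eff + tau2)
    mean_re = np.average(eff, weights=w)
    return np.sum(w * (eff - mean_re) ** 2) - (len(eff) - 1)


# root-find on an interval assumed to bracket a sign change; if the
# condition is already non-positive at tau2=0, the estimate is tau2 = 0
if _pm_condition(0, eff, var_eff) > 0:
    tau2_pm = optimize.brentq(_pm_condition, 0, 100, args=(eff, var_eff))
else:
    tau2_pm = 0.0
tau2_pm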

res4 = combine_effects(eff,
                       var_eff,
                       method_re="iterated",
                       use_t=False,
                       row_names=rownames)
res4_df = res4.summary_frame()
print("method RE:", res4.method_re)
print(res4.summary_frame())
fig = res4.plot_forest()
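
# The tau2 estimate can also be obtained from the private helper
# imported above. This is internal API, so the exact signature may
# change; here we assume ``_fit_tau_iterative`` returns the estimate
# together with a convergence flag.

tau2_iter, converged = _fit_tau_iterative(eff, var_eff)
print("tau2 iterated:", tau2_iter, "converged:", converged)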

# ## Example: Kacker interlaboratory mean
#
# In this example the effect size is the mean of measurements in a lab.
# We combine the estimates from several labs to estimate an overall
# average.

eff = np.array([61.00, 61.40, 62.21, 62.30, 62.34, 62.60, 62.70, 62.84, 65.90])
var_eff = np.array(
    [0.2025, 1.2100, 0.0900, 0.2025, 0.3844, 0.5625, 0.0676, 0.0225, 1.8225])
rownames = ["PTB", "NMi", "NIMC", "KRISS", "LGC", "NRC", "IRMM", "NIST", "LNE"]

res2_DL = combine_effects(eff,
                          var_eff,
                          method_re="dl",
                          use_t=True,
                          row_names=rownames)
print("method RE:", res2_DL.method_re)
print(res2_DL.summary_frame())
fig = res2_DL.plot_forest()
fig.set_figheight(6)
fig.set_figwidth(6)

res2_PM = combine_effects(eff,
                          var_eff,
                          method_re="pm",
                          use_t=True,
                          row_names=rownames)
print("method RE:", res2_PM.method_re)
print(res2_PM.summary_frame())
fig = res2_PM.plot_forest()
fig.set_figheight(6)
fig.set_figwidth(6)

# ## Meta-analysis of proportions
#
# In the following example the random effects variance tau is estimated
# to be zero.
# We then change two counts in the data, so the second example has a
# random effects variance greater than zero.

import io

ss = """    study,nei,nci,e1i,c1i,e2i,c2i,e3i,c3i,e4i,c4i
    1,19,22,16.0,20.0,11,12,4.0,8.0,4,3
    2,34,35,22.0,22.0,18,12,15.0,8.0,15,6
    3,72,68,44.0,40.0,21,15,10.0,3.0,3,0
    4,22,20,19.0,12.0,14,5,5.0,4.0,2,3
    5,70,32,62.0,27.0,42,13,26.0,6.0,15,5
    6,183,94,130.0,65.0,80,33,47.0,14.0,30,11
    7,26,50,24.0,30.0,13,18,5.0,10.0,3,9
    8,61,55,51.0,44.0,37,30,19.0,19.0,11,15
    9,36,25,30.0,17.0,23,12,13.0,4.0,10,4
    10,45,35,43.0,35.0,19,14,8.0,4.0,6,0
    11,246,208,169.0,139.0,106,76,67.0,42.0,51,35
    12,386,141,279.0,97.0,170,46,97.0,21.0,73,8
    13,59,32,56.0,30.0,34,17,21.0,9.0,20,7
    14,45,15,42.0,10.0,18,3,9.0,1.0,9,1
    15,14,18,14.0,18.0,13,14,12.0,13.0,9,12
    16,26,19,21.0,15.0,12,10,6.0,4.0,5,1
    17,74,75,,,42,40,,,23,30"""
df3 = pd.read_csv(io.StringIO(ss))
df_12y = df3[["e2i", "nei", "c2i", "nci"]]
# TODO: currently 1 is reference, switch labels
count1, nobs1, count2, nobs2 = df_12y.values.T
dta = df_12y.values.T

eff, var_eff = effectsize_2proportions(*dta, statistic="rd")

eff, var_eff

res5 = combine_effects(eff, var_eff, method_re="iterated",
                       use_t=False)  # , row_names=rownames)
res5_df = res5.summary_frame()
print("method RE:", res5.method_re)
print("RE variance tau2:", res5.tau2)
print(res5.summary_frame())
fig = res5.plot_forest()
fig.set_figheight(8)
fig.set_figwidth(6)

# ### Changing the data to have a positive random effects variance

dta_c = dta.copy()
dta_c.T[0, 0] = 18
dta_c.T[1, 0] = 22
dta_c.T

eff, var_eff = effectsize_2proportions(*dta_c, statistic="rd")
res5 = combine_effects(eff, var_eff, method_re="iterated",
                       use_t=False)  # , row_names=rownames)
res5_df = res5.summary_frame()
print("method RE:", res5.method_re)
print(res5.summary_frame())
fig = res5.plot_forest()
fig.set_figheight(8)
fig.set_figwidth(6)

res5 = combine_effects(eff, var_eff, method_re="chi2", use_t=False)
res5_df = res5.summary_frame()
print("method RE:", res5.method_re)
print(res5.summary_frame())
fig = res5.plot_forest()
fig.set_figheight(8)
fig.set_figwidth(6)

# ### Replicate the fixed effects analysis using GLM with var_weights
#
# `combine_effects` computes weighted average estimates which can be
# replicated using GLM with var_weights or with WLS.
# The `scale` option in `GLM.fit` can be used to replicate the fixed
# effects meta-analysis with a fixed scale and with the HKSJ
# (Hartung-Knapp-Sidik-Jonkman)/WLS scale.

from statsmodels.genmod.generalized_linear_model import GLM

eff, var_eff = effectsize_2proportions(*dta_c, statistic="or")
res = combine_effects(eff, var_eff, method_re="chi2", use_t=False)
res_frame = res.summary_frame()
print(res_frame.iloc[-4:])

# We need to fix `scale=1` in order to replicate the standard errors of
# the usual meta-analysis.

weights = 1 / var_eff
mod_glm = GLM(eff, np.ones(len(eff)), var_weights=weights)
res_glm = mod_glm.fit(scale=1.0)
print(res_glm.summary().tables[1])

# check results
res_glm.scale, res_glm.conf_int() - res_frame.loc["fixed effect",
                                                  ["ci_low", "ci_upp"]].values

# Using the HKSJ variance adjustment in meta-analysis is equivalent to
# estimating the scale using the Pearson chi2 statistic, which is also
# the default for the Gaussian family.

res_glm = mod_glm.fit(scale="x2")
print(res_glm.summary().tables[1])

# check results
res_glm.scale, res_glm.conf_int() - res_frame.loc["fixed effect",
                                                  ["ci_low", "ci_upp"]].values

# ### Mantel-Haenszel odds-ratio using contingency tables
#
# The fixed effect for the log-odds-ratio using the Mantel-Haenszel
# method can be computed directly using `StratifiedTable`.
#
# We need to create a 2 x 2 x k contingency table to be used with
# `StratifiedTable`.

t, nt, c, nc = dta_c
counts = np.column_stack([t, nt - t, c, nc - c])
ctables = counts.T.reshape(2, 2, -1)
ctables[:, :, 0]

counts[0]

dta_c.T[0]

import statsmodels.stats.api as smstats

st = smstats.StratifiedTable(ctables.astype(np.float64))

# compare the pooled log-odds-ratio and standard error to the R meta
# package

st.logodds_pooled, st.logodds_pooled - 0.4428186730553189  # R meta

st.logodds_pooled_se, st.logodds_pooled_se - 0.08928560091027186  # R meta

st.logodds_pooled_confint()

print(st.test_equal_odds())

print(st.test_null_odds())

# Check the conversion to the stratified contingency table.
#
# The row sums of each table are the sample sizes for the treatment and
# control experiments.

ctables.sum(1)

nt, nc

# **Results from R meta package**
#
# ```
# > res_mb_hk = metabin(e2i, nei, c2i, nci, data=dat2, sm="OR",
# Q.Cochrane=FALSE, method="MH", method.tau="DL", hakn=FALSE,
# backtransf=FALSE)
# > res_mb_hk
#      logOR            95%-CI %W(fixed) %W(random)
# 1   2.7081 [ 0.5265; 4.8896]       0.3        0.7
# 2   1.2567 [ 0.2658; 2.2476]       2.1        3.2
# 3   0.3749 [-0.3911; 1.1410]       5.4        5.4
# 4   1.6582 [ 0.3245; 2.9920]       0.9        1.8
# 5   0.7850 [-0.0673; 1.6372]       3.5        4.4
# 6   0.3617 [-0.1528; 0.8762]      12.1       11.8
# 7   0.5754 [-0.3861; 1.5368]       3.0        3.4
# 8   0.2505 [-0.4881; 0.9892]       6.1        5.8
# 9   0.6506 [-0.3877; 1.6889]       2.5        3.0
# 10  0.0918 [-0.8067; 0.9903]       4.5        3.9
# 11  0.2739 [-0.1047; 0.6525]      23.1       21.4
# 12  0.4858 [ 0.0804; 0.8911]      18.6       18.8
# 13  0.1823 [-0.6830; 1.0476]       4.6        4.2
# 14  0.9808 [-0.4178; 2.3795]       1.3        1.6
# 15  1.3122 [-1.0055; 3.6299]       0.4        0.6
# 16 -0.2595 [-1.4450; 0.9260]       3.1        2.3
# 17  0.1384 [-0.5076; 0.7844]       8.5        7.6
#
# Number of studies combined: k = 17
#
#                       logOR           95%-CI    z  p-value
# Fixed effect model   0.4428 [0.2678; 0.6178] 4.96 < 0.0001
# Random effects model 0.4295 [0.2504; 0.6086] 4.70 < 0.0001
#
# Quantifying heterogeneity:
#  tau^2 = 0.0017 [0.0000; 0.4589]; tau = 0.0410 [0.0000; 0.6774];
#  I^2 = 1.1% [0.0%; 51.6%]; H = 1.01 [1.00; 1.44]
#
# Test of heterogeneity:
#      Q d.f. p-value
#  16.18   16  0.4404
#
# Details on meta-analytical method:
# - Mantel-Haenszel method
# - DerSimonian-Laird estimator for tau^2
# - Jackson method for confidence interval of tau^2 and tau
#
# > res_mb_hk$TE.fixed
# [1] 0.4428186730553189
# > res_mb_hk$seTE.fixed
# [1] 0.08928560091027186
# > c(res_mb_hk$lower.fixed, res_mb_hk$upper.fixed)
# [1] 0.2678221109331694 0.6178152351774684
#
# ```
#

print(st.summary())
