From 2ec8f3b53c898ea7de206d1504f06c625728d424 Mon Sep 17 00:00:00 2001
From: statasaurus <55274484+statasaurus@users.noreply.github.com>
Date: Tue, 6 Aug 2024 15:13:50 +0000
Subject: [PATCH] =?UTF-8?q?Deploying=20to=20gh-pages=20from=20@=20PSIAIMS/?=
 =?UTF-8?q?CAMIS@0bca3eb4d850560a7db353b0665967c4060d81d1=20=F0=9F=9A=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 Comp/r-sas_cmh.html                  | 106 ++---
 Comp/r-sas_survey-stats-summary.html | 106 ++---
 R/PCA_analysis.html                  |   4 +-
 R/ancova.html                        | 636 +++++++++++++--------
 R/cmh.html                           | 106 ++---
 R/gsd-tte.html                       | 212 ++++-----
 R/mi_mar_predictive_mean_match.html  |  36 +-
 R/survey-stats-summary.html          | 212 ++++-----
 SAS/survey-stats-summary.html        | 212 ++++-----
 blogs/index.html                     |   6 +-
 index.html                           | 110 ++---
 minutes/index.html                   |  38 +-
 python/MANOVA.html                   |   2 +-
 python/Rounding.html                 |   2 +-
 python/Summary_statistics.html       |   2 +-
 python/ancova.html                   |  18 +-
 python/anova.html                    |   2 +-
 python/correlation.html              |   6 +-
 python/kruskal_wallis.html           |   4 +-
 python/linear_regression.html        |   8 +-
 python/logistic_regression.html      |  22 +-
 python/one_sample_t_test.html        |   4 +-
 python/paired_t_test.html            |   4 +-
 python/skewness_kurtosis.html        |   6 +-
 python/two_samples_t_test.html       |   6 +-
 search.json                          |  10 +-
 26 files changed, 940 insertions(+), 940 deletions(-)

diff --git a/Comp/r-sas_cmh.html b/Comp/r-sas_cmh.html index 671aec48..754a824f 100644 --- a/Comp/r-sas_cmh.html +++ b/Comp/r-sas_cmh.html @@ -348,23 +348,23 @@

CMH Statistics

As can be seen, there are two schemata for which R does not provide any results:

diff --git a/Comp/r-sas_survey-stats-summary.html b/Comp/r-sas_survey-stats-summary.html index b319aaa4..28dcc45e 100644 --- a/Comp/r-sas_survey-stats-summary.html +++ b/Comp/r-sas_survey-stats-summary.html @@ -639,23 +639,23 @@

Quantiles

head(apisrs) |> gt::gt()
diff --git a/R/PCA_analysis.html b/R/PCA_analysis.html index 77e42297..02d8e303 100644 --- a/R/PCA_analysis.html +++ b/R/PCA_analysis.html @@ -364,8 +364,8 @@

Vi scene = list(bgcolor = "lightgray")) fig

diff --git a/R/ancova.html b/R/ancova.html index e293deab..6d812df5 100644 --- a/R/ancova.html +++ b/R/ancova.html @@ -272,23 +272,23 @@

The Model

model_tidy <- model_ancova %>% tidy()
model_glance %>% gt()
@@ -752,23 +752,23 @@

The Model

model_tidy   %>% gt()
@@ -1246,23 +1246,23 @@

The Model

add_row(term = "Total", df = sum(.$df), sumsq = sum(.$sumsq))
model_table %>% gt()
@@ -1745,23 +1745,23 @@

Type 1

get_anova_table() %>% gt()
@@ -2238,23 +2238,23 @@

Type 2

get_anova_table() %>% gt()
@@ -2731,23 +2731,23 @@

Type 3

get_anova_table() %>% gt()
diff --git a/R/cmh.html b/R/cmh.html index 03624769..db36649d 100644 --- a/R/cmh.html +++ b/R/cmh.html @@ -240,23 +240,23 @@

Available R packages

We did not find any R package that delivers all the same measures as SAS at once. Therefore, we tried out multiple packages:

diff --git a/R/gsd-tte.html b/R/gsd-tte.html index 414e7172..5c841e65 100644 --- a/R/gsd-tte.html +++ b/R/gsd-tte.html @@ -438,23 +438,23 @@

Example using gsDesign
summary() |> as_gt()

@@ -988,23 +988,23 @@

Example using gsDesign
summary() |> as_gt()

diff --git a/R/mi_mar_predictive_mean_match.html b/R/mi_mar_predictive_mean_match.html index 3a77907e..4718a44c 100644 --- a/R/mi_mar_predictive_mean_match.html +++ b/R/mi_mar_predictive_mean_match.html @@ -368,15 +368,15 @@

Impute with PMM

imp_pmm$imp$bmi
      1    2    3    4    5
-1  33.2 22.7 27.2 30.1 27.4
-3  26.3 27.2 26.3 35.3 35.3
-4  28.7 22.5 27.4 22.7 22.7
-6  25.5 24.9 20.4 22.7 27.4
-10 22.5 22.0 22.5 28.7 22.0
-11 29.6 27.4 33.2 35.3 28.7
-12 22.0 25.5 27.5 20.4 28.7
-16 29.6 27.2 30.1 35.3 27.4
-21 27.2 30.1 27.2 25.5 22.5
+1  29.6 28.7 27.4 29.6 33.2
+3  27.2 30.1 35.3 30.1 22.0
+4  22.5 20.4 27.5 27.4 22.7
+6  20.4 27.4 21.7 22.5 21.7
+10 22.0 22.7 25.5 20.4 22.7
+11 27.4 27.2 29.6 35.3 30.1
+12 27.4 30.1 27.4 22.0 26.3
+16 30.1 27.2 30.1 27.4 24.9
+21 27.2 30.1 35.3 30.1 22.5

An alternative to the standard PMM is midastouch.

@@ -453,15 +453,15 @@

Impute with PMM

imp_pmms$imp$bmi
      1    2    3    4    5
-1  22.5 29.6 35.3 35.3 20.4
-3  30.1 29.6 27.5 30.1 22.0
-4  35.3 22.7 28.7 25.5 22.5
-6  35.3 22.7 24.9 25.5 22.5
-10 26.3 22.0 35.3 22.7 26.3
-11 22.5 22.5 29.6 30.1 20.4
-12 26.3 22.0 27.4 27.4 26.3
-16 22.5 22.5 29.6 30.1 22.0
-21 22.5 22.5 35.3 30.1 20.4
+1  30.1 28.7 30.1 20.4 28.7
+3  30.1 33.2 30.1 30.1 28.7
+4  24.9 22.5 25.5 22.5 25.5
+6  24.9 22.5 25.5 21.7 27.4
+10 30.1 27.4 22.7 26.3 28.7
+11 30.1 28.7 30.1 30.1 24.9
+12 30.1 27.4 22.0 26.3 28.7
+16 30.1 28.7 20.4 28.7 27.5
+21 33.2 22.5 28.7 20.4 27.5
diff --git a/R/survey-stats-summary.html b/R/survey-stats-summary.html index 8dd70919..729d05fe 100644 --- a/R/survey-stats-summary.html +++ b/R/survey-stats-summary.html @@ -267,23 +267,23 @@

Simple Survey Designs

head(apisrs) |> gt::gt()
@@ -1127,23 +1127,23 @@

Summary Statistics on Complex Survey Designs

head(nhanes) |> gt::gt()
diff --git a/SAS/survey-stats-summary.html b/SAS/survey-stats-summary.html index 476174dc..3aaaac24 100644 --- a/SAS/survey-stats-summary.html +++ b/SAS/survey-stats-summary.html @@ -262,23 +262,23 @@

Simple Survey Designs

We will use the API dataset (“API Data Files” 2006), which contains a number of datasets based on different samples from a dataset of academic performance. Initially we will just cover the methodology with a simple random sample and a finite population correction to demonstrate functionality.

@@ -1134,23 +1134,23 @@

Summary Statistics on Complex Survey Designs

Many of the previous examples and notes still apply to more complex survey designs; here we will demonstrate using a dataset from NHANES (“National Health and Nutrition Examination Survey Data” 2010), which uses both stratification and clustering:

diff --git a/blogs/index.html b/blogs/index.html index 1aef0fa5..f78289b8 100644 --- a/blogs/index.html +++ b/blogs/index.html @@ -243,7 +243,7 @@

Blogs


diff --git a/index.html b/index.html index 22bb2953..d5ba5901 100644 --- a/index.html +++ b/index.html @@ -203,23 +203,23 @@

Repository

The repository below provides examples of statistical methodology in different software and languages, along with a comparison of the results obtained and description of any discrepancies.

@@ -955,10 +955,10 @@

Repository

Group-sequential designs
+R
+East
Multivariate methods
diff --git a/minutes/index.html b/minutes/index.html index 1dc7cdfe..f627535a 100644 --- a/minutes/index.html +++ b/minutes/index.html @@ -247,7 +247,7 @@

Meeting Minutes


 
diff --git a/python/MANOVA.html b/python/MANOVA.html index 8a8b07a0..9ed084ee 100644 --- a/python/MANOVA.html +++ b/python/MANOVA.html @@ -228,7 +228,7 @@

MANOVA in Python

Example 39.6 Multivariate Analysis of Variance from SAS MANOVA User Guide

This example employs multivariate analysis of variance (MANOVA) to measure differences in the chemical characteristics of ancient pottery found at four kiln sites in Great Britain. The data are from Tubb, Parker, and Nickless (1980), as reported in Hand et al. (1994).

For each of 26 samples of pottery, the percentages of oxides of five metals are measured. The following statements create the data set and invoke the GLM procedure to perform a one-way MANOVA. Additionally, it is of interest to know whether the pottery from one site in Wales (Llanederyn) differs from the samples from other sites; a CONTRAST statement is used to test this hypothesis.

import pandas as pd
 from statsmodels.multivariate.manova import MANOVA
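For orientation, a minimal sketch of the corresponding Python fit (the data frame below is a small, hypothetical subset with made-up oxide values; only the variable names Al, Fe, Mg and Site are taken from the description above):

import pandas as pd
from statsmodels.multivariate.manova import MANOVA

# hypothetical pottery-style data: oxide percentages measured at three kiln sites
pottery = pd.DataFrame({
    "Site": ["Llanederyn"] * 3 + ["Caldicot"] * 3 + ["IsleThorns"] * 3,
    "Al":   [14.4, 13.8, 14.6, 11.8, 11.6, 11.5, 18.3, 17.7, 18.0],
    "Fe":   [7.00, 7.08, 7.09, 5.44, 5.39, 5.46, 1.28, 1.12, 1.14],
    "Mg":   [4.30, 3.43, 3.88, 3.94, 3.77, 3.66, 0.67, 0.56, 0.67],
})

# one-way MANOVA of the oxide measurements on kiln site
fit = MANOVA.from_formula("Al + Fe + Mg ~ Site", data=pottery)
print(fit.mv_test())  # Wilks' lambda, Pillai's trace, Hotelling-Lawley, Roy's root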
 
diff --git a/python/Rounding.html b/python/Rounding.html
index ddf38937..7d8d70d4 100644
--- a/python/Rounding.html
+++ b/python/Rounding.html
@@ -217,7 +217,7 @@ 

Rounding in Python

Python has a built-in round() function that takes two numeric arguments, number and ndigits, and returns the number rounded to the specified number of decimals.

The default number of decimals is 0, meaning that the function will return the nearest integer.

# For integers
 x= 12
 print(round(x))
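A small additional sketch (not from the original example) showing a behaviour worth knowing: Python's round() uses round-half-to-even ("banker's rounding"), so ties do not always round away from zero:

# ties are rounded to the nearest even integer
print(round(0.5))   # 0
print(round(1.5))   # 2
print(round(2.5))   # 2

# with ndigits given, binary floating point representation can also affect the result
print(round(2.675, 2))  # 2.67, because 2.675 is stored as slightly less than 2.675
print(round(7 / 3, 3))  # 2.333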
diff --git a/python/Summary_statistics.html b/python/Summary_statistics.html
index 12974f34..e34f33bc 100644
--- a/python/Summary_statistics.html
+++ b/python/Summary_statistics.html
@@ -222,7 +222,7 @@ 

Summary statistics

4. out (optional): An alternate output array in which to place the result.

5. overwrite_input (optional): If True, allows the input array to be modified (to save memory) when computing the median.

6. keepdims (optional): If True, the reduced axes are kept in the result as dimensions with size one.

import numpy as np
 
 sample_data=[12, 25, 16, 50, 34, 29, 60, 86, 52, 39, 41]
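As an illustration of the optional arguments listed above (a sketch, not part of the original example):

import numpy as np

sample_data = [12, 25, 16, 50, 34, 29, 60, 86, 52, 39, 41]

print(np.median(sample_data))                    # 39.0

arr = np.array(sample_data, dtype=float).reshape(1, -1)
print(np.median(arr, axis=1, keepdims=True))     # [[39.]] - reduced axis kept with size one

out = np.empty(1)
np.median(arr, axis=1, out=out)                  # result written into a pre-allocated array
print(out)                                       # [39.]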
diff --git a/python/ancova.html b/python/ancova.html
index 9e6c4049..0b844735 100644
--- a/python/ancova.html
+++ b/python/ancova.html
@@ -232,7 +232,7 @@ 

Introduction

Data Summary

import pandas as pd
 
 # Input data
@@ -250,7 +250,7 @@ 

Data Summary

df = pd.DataFrame(data)
# Descriptive statistics
 summary_stats = df.describe()
 
@@ -279,7 +279,7 @@ 

Data Summary

Ancova in Python

In Python, Ancova can be performed using the statsmodels library.

import statsmodels.api as sm
 import statsmodels.formula.api as smf
 from tabulate import tabulate
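The core model call reduces to a single ols() formula with the treatment factor and the baseline covariate; a short self-contained sketch (the column names drug, pre and post are those used on this page, but the values below are illustrative only):

import pandas as pd
import statsmodels.formula.api as smf

# illustrative data with the same structure as the page's dataset
df_demo = pd.DataFrame({
    "drug": ["A", "A", "A", "D", "D", "D", "F", "F", "F"],
    "pre":  [11, 8, 5, 6, 9, 16, 16, 12, 7],
    "post": [6, 0, 2, 0, 3, 13, 13, 10, 6],
})

# Ancova: treatment effect on post, adjusting for the baseline covariate pre
fit = smf.ols("post ~ drug + pre", data=df_demo).fit()
print(fit.summary())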
@@ -313,7 +313,7 @@ 

Ancova in Python

Model:                            OLS   Adj. R-squared:                  0.639
Method:                 Least Squares   F-statistic:                     18.10
Date:                Tue, 06 Aug 2024   Prob (F-statistic):           1.50e-06
-Time:                       15:06:24   Log-Likelihood:                -82.054
+Time:                       15:09:28   Log-Likelihood:                -82.054
No. Observations:                  30   AIC:                             172.1
Df Residuals:                      26   BIC:                             177.7
Df Model:                           3
@@ -354,7 +354,7 @@

Ancova in Python

Please note that all values match with the corresponding R version, except for the AIC and BIC values, which differ slightly. This should be acceptable for most practical purposes in statistical analysis. Currently, there are ongoing discussions in the statsmodels community regarding the computational details of AIC and BIC.

The following code can be used to enforce complete consistency of AIC and BIC values with R outputs by adding 1 to the number of parameters:

import numpy as np
 
 # Manual calculation of AIC and BIC to ensure consistency with R
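In essence, the residual variance is counted as one extra estimated parameter. For the model above (log-likelihood -82.054, 4 regression coefficients plus the residual variance, n = 30) this gives AIC = 2 * 5 - 2 * (-82.054) ≈ 174.11 and BIC = ln(30) * 5 - 2 * (-82.054) ≈ 181.11, matching the R values.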
@@ -383,7 +383,7 @@ 

Ancova in Python

There are different types of anova computations (types I, II and III). The statsmodels.stats.anova.anova_lm function supports all three through its typ argument. The code to compute these types is shown below:

import statsmodels.formula.api as smf
 import statsmodels.stats.anova as ssa
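A minimal sketch of the three calls (assuming the data frame df and the imports just above):

# fit the Ancova model once, then request each sums-of-squares decomposition via `typ`
model = smf.ols('post ~ C(drug) + pre', data=df).fit()

ancova_table_type_1 = ssa.anova_lm(model, typ=1)  # sequential (Type I) sums of squares
ancova_table_type_2 = ssa.anova_lm(model, typ=2)  # partial (Type II) sums of squares
ancova_table_type_3 = ssa.anova_lm(model, typ=3)  # marginal (Type III) sums of squares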
 
@@ -455,7 +455,7 @@ 

Ancova in Python

Type 1 Ancova in Python

print(tabulate(ancova_table_type_1, headers='keys', tablefmt='grid'))
+----+----------+-------+-------+---------+---------+----------+-------------+----------+----------+
@@ -470,7 +470,7 @@ 

Type 1 Ancova in P

Type 2 Ancova in Python

print(tabulate(ancova_table_type_2, headers='keys', tablefmt='grid'))
+----+----------+-------+-------+----------+---------+----------+-------------+----------+----------+
@@ -485,7 +485,7 @@ 

Type 2 Ancova in P

Type 3 Ancova in Python

print(tabulate(ancova_table_type_3, headers='keys', tablefmt='grid'))
+----+-----------+-------+-------+----------+---------+----------+-------------+----------+----------+
diff --git a/python/anova.html b/python/anova.html
index e16962ad..db559c7b 100644
--- a/python/anova.html
+++ b/python/anova.html
@@ -233,7 +233,7 @@ 

Introduction

Anova Test in Python

To perform a one-way ANOVA test in Python we can use the f_oneway() function from the SciPy library. Similarly, to perform a two-way ANOVA test, the anova_lm() function from the statsmodels library is frequently used.

For this test, we’ll create a data frame called df_disease taken from the SAS documentation. The corresponding data can be found here. In this experiment, we are trying to find the impact of different drug and disease groups on stem length.

import pandas as pd
 import statsmodels.api as sm
 from statsmodels.formula.api import ols
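As a hedged sketch of both routes, reusing the imports above (the group values below are made up for illustration and are not the df_disease data):

from scipy.stats import f_oneway

# one-way ANOVA with scipy: compare mean stem length across three drugs
a = [42, 44, 36, 13, 19, 22]
b = [33, 26, 33, 21, 24, 26]
c = [31, 25, 24, 40, 41, 18]
print(f_oneway(a, b, c))

# two-way ANOVA with statsmodels: drug, disease and their interaction
df_demo = pd.DataFrame({
    "stemlength": a + b + c,
    "drug":    ["1"] * 6 + ["2"] * 6 + ["3"] * 6,
    "disease": ["1", "1", "2", "2", "3", "3"] * 3,
})
model = ols("stemlength ~ C(drug) * C(disease)", data=df_demo).fit()
print(sm.stats.anova_lm(model, typ=2))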
diff --git a/python/correlation.html b/python/correlation.html
index 326ad5d7..273c68f7 100644
--- a/python/correlation.html
+++ b/python/correlation.html
@@ -228,7 +228,7 @@ 

Correlation Analysis in Python

Pearson’s Correlation

It is a parametric correlation test because it depends on the distribution of the data. It measures the linear dependence between two variables x and y. It is the ratio between the covariance of the two variables and the product of their standard deviations. The result always has a value between -1 and 1.

import pandas as pd
 from scipy.stats import pearsonr
 
@@ -250,7 +250,7 @@ 

Pearson’s Correlati

Kendall’s Rank

A τ test is a non-parametric hypothesis test for statistical dependence based on the τ coefficient. It is a measure of rank correlation: the similarity of the orderings of the data when ranked by each of the quantities. The Kendall correlation between two variables will be high when observations have a similar (or identical for a correlation of 1) rank (i.e. relative position label of the observations within the variable: 1st, 2nd, 3rd, etc.) between the two variables, and low when observations have a dissimilar (or fully different for a correlation of −1) rank between the two variables.

import pandas as pd
 from scipy.stats import kendalltau 
 
@@ -273,7 +273,7 @@ 

Kendall’s Rank

Spearman’s Rank

Spearman’s Rank Correlation is a statistical measure of the strength and direction of the monotonic relationship between two continuous variables. The values of each variable are first converted to ranks, and the correlation is computed on those ranks. It is denoted by the symbol “rho” (ρ) and can take values between -1 and +1. A positive value of rho indicates that there exists a positive relationship between the two variables, while a negative value of rho indicates a negative relationship. A rho value of 0 indicates no association between the two variables.

import pandas as pd
 from scipy.stats import spearmanr
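To make the three calls on this page concrete, here is a small self-contained sketch (the height/weight values are illustrative only):

import pandas as pd
from scipy.stats import pearsonr, kendalltau, spearmanr

df = pd.DataFrame({
    "height": [150, 160, 165, 170, 175, 180, 185],
    "weight": [55, 58, 63, 66, 70, 76, 82],
})

# each function returns the correlation coefficient and its p-value
for name, func in [("Pearson", pearsonr), ("Kendall", kendalltau), ("Spearman", spearmanr)]:
    stat, p = func(df["height"], df["weight"])
    print(f"{name}: correlation = {stat:.3f}, p-value = {p:.4f}")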
  
diff --git a/python/kruskal_wallis.html b/python/kruskal_wallis.html
index 0a04d9c5..54662f1e 100644
--- a/python/kruskal_wallis.html
+++ b/python/kruskal_wallis.html
@@ -226,7 +226,7 @@ 

Kruskal Wallis in Python

Introduction

The Kruskal-Wallis test is a non-parametric equivalent to the one-way ANOVA. For this example, the data used is a subset of the iris dataset, testing for difference in sepal width between species of flower.

import pandas as pd
 
 # Define the data
@@ -266,7 +266,7 @@ 

Introduction

Implementing Kruskal-Wallis in Python

The Kruskal-Wallis test can be implemented in Python using the kruskal function from scipy.stats. The null hypothesis is that the samples are from identical populations.

from scipy.stats import kruskal
 
 # Separate the data for each species
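A minimal sketch of the call itself (the sepal-width values below are illustrative, not the full iris subset used on this page):

from scipy.stats import kruskal

setosa     = [3.4, 3.0, 3.4, 3.2, 3.1]
versicolor = [2.7, 2.9, 2.5, 2.8, 3.0]
virginica  = [3.0, 2.9, 2.5, 3.3, 2.7]

# null hypothesis: the samples come from identical populations
stat, p = kruskal(setosa, versicolor, virginica)
print(f"H = {stat:.3f}, p-value = {p:.4f}")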
diff --git a/python/linear_regression.html b/python/linear_regression.html
index 0b430f1b..d51bd6b0 100644
--- a/python/linear_regression.html
+++ b/python/linear_regression.html
@@ -229,7 +229,7 @@ 

Linear Regression

Descriptive Statistics

The first step is to obtain the simple descriptive statistics for the numeric variables of the htwt data, and one-way frequencies for categorical variables. This is accomplished by employing the summary function. There are 237 participants who are from 13.9 to 25 years old. It is a cross-sectional study, with each participant having one observation. We can use this data set to examine the relationship of participants’ height to their age and sex.

import pandas as pd
 import statsmodels.api as sm
 
@@ -237,7 +237,7 @@ 

Descriptive Statistics
htwt = pd.read_csv("../data/htwt.csv")

In order to create a regression model to demonstrate the relationship between age and height for females, we first need to create a flag variable identifying females and an interaction variable between age and female gender flag.

htwt['female'] = (htwt['SEX'] == 'f').astype(int)
 htwt['fem_age'] = htwt['AGE'] * htwt['female']
 htwt.head()
@@ -319,7 +319,7 @@

Descriptive Statist

Regression Analysis

Next, we fit a regression model, representing the relationships between gender, age, height and the interaction variable created in the datastep above. We again use a where statement to restrict the analysis to those who are less than or equal to 19 years old. We use the clb option to get a 95% confidence interval for each of the parameters in the model. The model that we are fitting is height = b0 + b1 x female + b2 x age + b3 x fem_age + e

X = htwt[['female', 'AGE', 'fem_age']][htwt['AGE'] <= 19]
 X = sm.add_constant(X)
 Y = htwt['HEIGHT'][htwt['AGE'] <= 19]
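The 95% confidence limits mentioned above (the equivalent of SAS's clb option) can be pulled directly from the fitted model; a short sketch assuming the X and Y defined just above:

model = sm.OLS(Y, X).fit()

# 95% confidence interval for each parameter
print(model.conf_int(alpha=0.05))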
@@ -357,7 +357,7 @@ 

Regression Analysis
-Time:                        15:06:27   Log-Likelihood:                -534.17
+Time:                        15:09:31   Log-Likelihood:                -534.17
diff --git a/python/logistic_regression.html b/python/logistic_regression.html index 1ea91895..0e494695 100644 --- a/python/logistic_regression.html +++ b/python/logistic_regression.html @@ -267,7 +267,7 @@

Logistic Regression

Imports

#data manipulation
 import pandas as pd
 import numpy as np
@@ -289,7 +289,7 @@ 

Background

Example : Lung cancer data

Data source: Loprinzi CL. Laurie JA. Wieand HS. Krook JE. Novotny PJ. Kugler JW. Bartel J. Law M. Bateman M. Klatt NE. et al. Prospective evaluation of prognostic variables from patient-completed questionnaires. North Central Cancer Treatment Group. Journal of Clinical Oncology. 12(3):601-7, 1994.

These data were sourced from the R package {survival} and have been downloaded and stored in the data folder.

# importing and prepare
 lung2 = pd.read_csv("../data/lung_cancer.csv")
 
@@ -302,7 +302,7 @@ 

Example : Lung cancer data

Logistic Regression Modelling

Let’s further prepare our data for modelling by selecting the explanatory variables and the dependent variable. The Python packages that we are aware of require complete (i.e. no missing values) data, so for convenience of demonstrating these methods we will drop rows with missing values.

x_vars = ["age", "sex", "ph.ecog", "meal.cal"]
 y_var = "wt_grp"
 
@@ -316,7 +316,7 @@ 

Logistic Regression Modelling

Statsmodels package

We will use the sm.Logit() method to fit our logistic regression model.

#intercept column
 x_sm = sm.add_constant(x)
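The fit itself then follows (a sketch of the remaining lines, consistent with the summary output shown later on this page):

# fit model
lr_sm = sm.Logit(y, x_sm).fit()
print(lr_sm.summary())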
 
@@ -333,7 +333,7 @@ 

Statsmodels package

Model fitting

In addition to the information contained in the summary, we can display the model coefficients as odds ratios:

print("Odds ratios for statsmodels logistic regression:")
 print(np.exp(lr_sm.params))
@@ -364,7 +364,7 @@

Model fitting

We can also provide the confidence intervals for the odds ratios at the 5% significance level (i.e. 95% confidence intervals):

print("CI at 5% for statsmodels logistic regression:")
 print(np.exp(lr_sm.conf_int(alpha = 0.05)))
@@ -381,7 +381,7 @@

Model fitting

Prediction

Let’s use our trained model to make a weight loss prediction about a new patient.

# new female, symptomatic but completely ambulatory patient consuming 2500 calories
 new_pt = pd.DataFrame({
     "age": [56],
@@ -419,11 +419,11 @@ 

Scikit-learn Package

It’s important to note that l2 regularisation is applied by default in the scikit-learn implementation of logistic regression. More recent releases of this package include an option to have no regularisation penalty.

lr_sk = LogisticRegression(penalty=None).fit(x, y)

Unlike the statsmodels approach, scikit-learn doesn’t have a summary method for the model, but you can extract some of the model parameters as follows:

print("Intercept for scikit learn logistic regression:")
 print(lr_sk.intercept_)
 print("Odds ratios for scikit learn logistic regression:")
@@ -439,7 +439,7 @@ 

Scikit-learn Package

Prediction

Using the same new patient example we can use our logistic regression model to make a prediction. The predict_proba method is used to return the probability for each class. If you are interested in viewing the prediction for y = 1, i.e. the probability of weight loss then you can select the second probability as shown:

print("Probability of weight loss using the scikit-learn package:")
 print(lr_sk.predict_proba(new_pt)[:,1])
diff --git a/python/one_sample_t_test.html b/python/one_sample_t_test.html index 6b953118..e069c28e 100644 --- a/python/one_sample_t_test.html +++ b/python/one_sample_t_test.html @@ -233,7 +233,7 @@

One Sample t-t

Data Used

import pandas as pd
 
 # Create sample data
@@ -248,7 +248,7 @@ 

Data Used



The following code was used to test the comparison in Python. Note that the baseline null hypothesis goes in the “popmean” parameter.

import pandas as pd
 from scipy import stats
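For illustration, a hedged, self-contained sketch of the call (the column name and values are assumptions, not the data used on this page):

import pandas as pd
from scipy import stats

# illustrative data: change from baseline for eight subjects
df = pd.DataFrame({"chg": [1.2, 2.3, -0.4, 0.8, 1.9, 0.3, 1.1, 0.7]})

# one-sample t-test; the null-hypothesis value goes in `popmean`
t_stat, p_value = stats.ttest_1samp(df["chg"], popmean=0)
print(f"t = {t_stat:.3f}, p = {p_value:.4f}")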
 
diff --git a/python/paired_t_test.html b/python/paired_t_test.html
index 89cbf40f..a32c836c 100644
--- a/python/paired_t_test.html
+++ b/python/paired_t_test.html
@@ -235,7 +235,7 @@ 

Paired t-test in P

Data Used

import pandas as pd
 
 # Create sample data
@@ -250,7 +250,7 @@ 

Data Used

Paired t-test

The following code was used to test the comparison in Python. Note that the baseline null hypothesis goes in the “popmean” parameter.

import pandas as pd
 from scipy import stats
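A hedged sketch of one common way to run the paired comparison (values are illustrative; the test is a one-sample t-test on the within-subject differences, with the null value supplied via popmean, and scipy's ttest_rel gives the equivalent result):

import pandas as pd
from scipy import stats

# illustrative paired data: pre- and post-treatment measurements for six subjects
df = pd.DataFrame({
    "pre":  [14.2, 13.9, 12.8, 15.1, 14.6, 13.3],
    "post": [13.1, 13.5, 12.0, 14.0, 14.8, 12.7],
})

t_stat, p_value = stats.ttest_1samp(df["post"] - df["pre"], popmean=0)
print(f"t = {t_stat:.3f}, p = {p_value:.4f}")

# equivalently: stats.ttest_rel(df["post"], df["pre"])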
 
diff --git a/python/skewness_kurtosis.html b/python/skewness_kurtosis.html
index afed08c0..0f54649b 100644
--- a/python/skewness_kurtosis.html
+++ b/python/skewness_kurtosis.html
@@ -258,7 +258,7 @@ 

Skewness and Kurtosis in Python

Skewness measures the amount of asymmetry in a distribution, while kurtosis describes the “tailedness” of the curve. These measures are frequently used to assess the normality of the data. There are several methods to calculate these measures. In Python, the pandas package and the scipy.stats functions skew and kurtosis can be used.

Data Used

import pandas as pd
 from scipy.stats import skew, kurtosis
 
@@ -288,7 +288,7 @@ 

Skewness

\]

All three skewness measures are unbiased under normality. The three methods are illustrated in the following code:

# Skewness
 type1_skew = skew(df['points'])
 type2_skew = df['points'].skew()
@@ -319,7 +319,7 @@ 

Kurtosis

\[b_2 = m_4/s^4-3 = (g_2 + 3)(1-1/n)^2-3\]

Only \(G_2\) (corresponding to type 2) is unbiased under normality. The three methods are illustrated in the following code:

# Kurtosis
 type1_kurt = kurtosis(df['points'])
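Given the relationship above, the type 2 (pandas) and type 3 estimates can be obtained alongside the scipy (type 1) value; a sketch assuming the df['points'] column and imports used earlier on this page:

n = len(df['points'])

type1_kurt = kurtosis(df['points'])                    # g2: scipy default (Fisher, biased)
type2_kurt = df['points'].kurtosis()                   # G2: pandas' bias-corrected estimate
type3_kurt = (type1_kurt + 3) * (1 - 1 / n) ** 2 - 3   # b2, per the formula above

print(type1_kurt, type2_kurt, type3_kurt)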
 
diff --git a/python/two_samples_t_test.html b/python/two_samples_t_test.html
index 2010bb2f..909771c5 100644
--- a/python/two_samples_t_test.html
+++ b/python/two_samples_t_test.html
@@ -235,7 +235,7 @@ 

Two Sample t-test in Python

Data Used

The following data was used in this example.

import pandas as pd
 import numpy as np
 from scipy import stats
@@ -257,7 +257,7 @@ 

Student’s T-Test

Code

The following code was used to test the comparison in Python. Note that we must separate the single variable into two variables to satisfy the scipy stats package syntax.

# Separate data into two groups
 group1 = df[df['trt_grp'] == 'placebo']['WtGain']
 group2 = df[df['trt_grp'] == 'treatment']['WtGain']
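The test call itself then continues from the two groups created above (Student's version with equal variances assumed; the Welch variant further down sets equal_var=False):

# Student's t-test assuming equal variances (scipy default)
t_stat, p_value_equal_var = stats.ttest_ind(group1, group2, equal_var=True)
print(f"t = {t_stat:.3f}, p = {p_value_equal_var:.4f}")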
@@ -281,7 +281,7 @@ 

Welch’s T-Test

Code

The following code was used to test the comparison in Python using Welch’s t-test.

# Perform Welch's t-test assuming unequal variances
 t_stat_welch, p_value_unequal_var = stats.ttest_ind(group1, group2, equal_var=False)
 
diff --git a/search.json b/search.json
index ed98a5b7..f2cd61f0 100644
--- a/search.json
+++ b/search.json
@@ -508,7 +508,7 @@
     "href": "index.html",
     "title": "CAMIS - A PHUSE DVOST Working Group",
     "section": "",
-    "text": "Introduction\nSeveral discrepancies have been discovered in statistical analysis results between different programming languages, even in fully qualified statistical computing environments. Subtle differences exist between the fundamental approaches implemented by each language, yielding differences in results which are each correct in their own right. The fact that these differences exist causes unease on the behalf of sponsor companies when submitting to a regulatory agency, as it is uncertain if the agency will view these differences as problematic. In its Statistical Software Clarifying Statement, the US Food and Drug Administration (FDA) states that it “FDA does not require use of any specific software for statistical analyses” and that “the computer software used for data management and statistical analysis should be reliable.” Observing differences across languages can reduce the analyst’s confidence in reliability and, by understanding the source of any discrepancies, one can reinstate confidence in reliability.\n\nMotivation\nThe goal of this project is to demystify conflicting results between software and to help ease the transitions to new languages by providing comparison and comprehensive explanations.\n\n\nRepository\nThe repository below provides examples of statistical methodology in different software and languages, along with a comparison of the results obtained and description of any discrepancies.\n\n\n\n\n\n\n\n\nMethods\nR\nSAS\nPython\nComparison\n\n\n\n\nSummary Statistics\nRounding\nR\nSAS\nPython\nR vs SAS\n\n\nSummary statistics\nR\nSAS\nPython\nR vs SAS\n\n\nSkewness/Kurtosis\nR\nSAS\nPython\nR vs SAS\n\n\nGeneral Linear Models\nOne Sample t-test\nR\nSAS\nPython\nR vs SAS\n\n\nPaired t-test\nR\nSAS\nPython\nR vs SAS\n\n\nTwo Sample t-test\nR\nSAS\nPython\nR vs SAS\n\n\nANOVA\nR\nSAS\nPython\nR vs SAS\n\n\nANCOVA\nR\nSAS\nPython\nR vs SAS\n\n\nMANOVA\nR\nSAS\nPython\nR vs SAS\n\n\nLinear Regression\nR\nSAS\nPython\nR vs SAS\n\n\nGeneralized Linear Models\nLogistic Regression\nR\nSAS\nPython\nR vs SAS\n\n\nPoisson/Negative Binomial Regression\nR\n\n\nR vs SAS\n\n\nCategorical Repeated Measures\n\n\n\n\n\n\nCategorical Multiple Imputation\n\n\n\n\n\n\nNon-parametric Analysis\nWilcoxon signed rank\nR\n\n\n\n\n\nMann-Whitney U/Wilcoxon rank sum\nR\n\n\n\n\n\nKolmogorov-Smirnov test\n\n\n\n\n\n\nKruskall-Wallis test\nR\nSAS\nPython\nR vs SAS\n\n\nFriedman test\n\n\n\n\n\n\nJonckheere test\n\n\n\n\n\n\nHodges-Lehman Estimator\nR\nSAS\n\n\n\n\nCategorical Data Analysis\nBinomial test\nR\n\n\n\n\n\nMcNemar's test\nR\nSAS\n\nR vs SAS\n\n\nChi-Square Association/Fishers exact\nR\nSAS\n\nR vs SAS\n\n\nCochran Mantel Haenszel\nR\nSAS\n\nR vs SAS\n\n\nConfidence Intervals for proportions\nR\n\n\n\n\n\nRepeated Measures\nLinear Mixed Model (MMRM)\nR\nSAS\n\nR vs SAS\n\n\nGeneralized Linear Mixed Model (MMRM)\n\n\n\n\n\n\nBayesian MMRM\n\n\n\n\n\n\nMultiple Imputation - Continuous Data MAR\nMCMC\n\n\n\n\n\n\nLinear regression\nR\n\n\n\n\n\nPredictive Mean Matching\nR\n\n\n\n\n\nPropensity Scores\n\n\n\n\n\n\nMultiple Imputation - Continuous Data MNAR\nDelta Adjustment/Tipping Point\n\n\n\n\n\n\nReference-Based Imputation/Sequential Methods\n\n\n\n\n\n\nReference-Based Imputation/Joint Modelling\n\n\n\n\n\n\nCorrelation\nPearson's/ Spearman's/ Kendall's Rank\nR\nSAS\nPython\nR vs SAS\n\n\nSurvival Models\nKaplan-Meier Log-rank test and Cox-PH\nR\nSAS\n\nR vs SAS\n\n\nAccelerated Failure Time\nR\n\n\n\n\n\nNon-proportional hazards methods\n\n\n\n\n\n\nSample size 
/Power calculations\nSingle timepoint analysis\n\n\n\n\n\n\nGroup-sequential designs\n\n\n\n\n\n\nMultivariate methods\nClustering\n\n\n\n\n\n\nFactor analysis\n\n\n\n\n\n\nPCA\nR\n\n\n\n\n\nCanonical correlation\n\n\n\n\n\n\nPLS\n\n\n\n\n\n\nOther Methods\nSurvey statistics\nR\nSAS\n\nR vs SAS\n\n\nNearest neighbour\n\n\n\n\n\n\nCausal inference\n\n\n\n\n\n\nMachine learning"
+    "text": "Introduction\nSeveral discrepancies have been discovered in statistical analysis results between different programming languages, even in fully qualified statistical computing environments. Subtle differences exist between the fundamental approaches implemented by each language, yielding differences in results which are each correct in their own right. The fact that these differences exist causes unease on the behalf of sponsor companies when submitting to a regulatory agency, as it is uncertain if the agency will view these differences as problematic. In its Statistical Software Clarifying Statement, the US Food and Drug Administration (FDA) states that it “FDA does not require use of any specific software for statistical analyses” and that “the computer software used for data management and statistical analysis should be reliable.” Observing differences across languages can reduce the analyst’s confidence in reliability and, by understanding the source of any discrepancies, one can reinstate confidence in reliability.\n\nMotivation\nThe goal of this project is to demystify conflicting results between software and to help ease the transitions to new languages by providing comparison and comprehensive explanations.\n\n\nRepository\nThe repository below provides examples of statistical methodology in different software and languages, along with a comparison of the results obtained and description of any discrepancies.\n\n\n\n\n\n\n\n\nMethods\nR\nSAS\nPython\nComparison\n\n\n\n\nSummary Statistics\nRounding\nR\nSAS\nPython\nR vs SAS\n\n\nSummary statistics\nR\nSAS\nPython\nR vs SAS\n\n\nSkewness/Kurtosis\nR\nSAS\nPython\nR vs SAS\n\n\nGeneral Linear Models\nOne Sample t-test\nR\nSAS\nPython\nR vs SAS\n\n\nPaired t-test\nR\nSAS\nPython\nR vs SAS\n\n\nTwo Sample t-test\nR\nSAS\nPython\nR vs SAS\n\n\nANOVA\nR\nSAS\nPython\nR vs SAS\n\n\nANCOVA\nR\nSAS\nPython\nR vs SAS\n\n\nMANOVA\nR\nSAS\nPython\nR vs SAS\n\n\nLinear Regression\nR\nSAS\nPython\nR vs SAS\n\n\nGeneralized Linear Models\nLogistic Regression\nR\nSAS\nPython\nR vs SAS\n\n\nPoisson/Negative Binomial Regression\nR\n\n\nR vs SAS\n\n\nCategorical Repeated Measures\n\n\n\n\n\n\nCategorical Multiple Imputation\n\n\n\n\n\n\nNon-parametric Analysis\nWilcoxon signed rank\nR\n\n\n\n\n\nMann-Whitney U/Wilcoxon rank sum\nR\n\n\n\n\n\nKolmogorov-Smirnov test\n\n\n\n\n\n\nKruskall-Wallis test\nR\nSAS\nPython\nR vs SAS\n\n\nFriedman test\n\n\n\n\n\n\nJonckheere test\n\n\n\n\n\n\nHodges-Lehman Estimator\nR\nSAS\n\n\n\n\nCategorical Data Analysis\nBinomial test\nR\n\n\n\n\n\nMcNemar's test\nR\nSAS\n\nR vs SAS\n\n\nChi-Square Association/Fishers exact\nR\nSAS\n\nR vs SAS\n\n\nCochran Mantel Haenszel\nR\nSAS\n\nR vs SAS\n\n\nConfidence Intervals for proportions\nR\n\n\n\n\n\nRepeated Measures\nLinear Mixed Model (MMRM)\nR\nSAS\n\nR vs SAS\n\n\nGeneralized Linear Mixed Model (MMRM)\n\n\n\n\n\n\nBayesian MMRM\n\n\n\n\n\n\nMultiple Imputation - Continuous Data MAR\nMCMC\n\n\n\n\n\n\nLinear regression\nR\n\n\n\n\n\nPredictive Mean Matching\nR\n\n\n\n\n\nPropensity Scores\n\n\n\n\n\n\nMultiple Imputation - Continuous Data MNAR\nDelta Adjustment/Tipping Point\n\n\n\n\n\n\nReference-Based Imputation/Sequential Methods\n\n\n\n\n\n\nReference-Based Imputation/Joint Modelling\n\n\n\n\n\n\nCorrelation\nPearson's/ Spearman's/ Kendall's Rank\nR\nSAS\nPython\nR vs SAS\n\n\nSurvival Models\nKaplan-Meier Log-rank test and Cox-PH\nR\nSAS\n\nR vs SAS\n\n\nAccelerated Failure Time\nR\n\n\n\n\n\nNon-proportional hazards methods\n\n\n\n\n\n\nSample size 
/Power calculations\nSingle timepoint analysis\n\n\n\n\n\n\nGroup-sequential designs\nR\n\n\nEast\n\n\nMultivariate methods\nClustering\n\n\n\n\n\n\nFactor analysis\n\n\n\n\n\n\nPCA\nR\n\n\n\n\n\nCanonical correlation\n\n\n\n\n\n\nPLS\n\n\n\n\n\n\nOther Methods\nSurvey statistics\nR\nSAS\n\nR vs SAS\n\n\nNearest neighbour\n\n\n\n\n\n\nCausal inference\n\n\n\n\n\n\nMachine learning"
   },
   {
     "objectID": "contribution/contribution.html",
@@ -557,7 +557,7 @@
     "href": "python/linear_regression.html",
     "title": "Linear Regression",
     "section": "",
-    "text": "To demonstrate the use of linear regression we examine a dataset that illustrates the relationship between Height and Weight in a group of 237 teen-aged boys and girls. The dataset is available here and is imported to the workspace.\n\nDescriptive Statistics\nThe first step is to obtain the simple descriptive statistics for the numeric variables of htwt data, and one-way frequencies for categorical variables. This is accomplished by employing summary function. There are 237 participants who are from 13.9 to 25 years old. It is a cross-sectional study, with each participant having one observation. We can use this data set to examine the relationship of participants’ height to their age and sex.\n\nimport pandas as pd\nimport statsmodels.api as sm\n\n# Importing CSV\nhtwt = pd.read_csv(\"../data/htwt.csv\")\n\nIn order to create a regression model to demonstrate the relationship between age and height for females, we first need to create a flag variable identifying females and an interaction variable between age and female gender flag.\n\nhtwt['female'] = (htwt['SEX'] == 'f').astype(int)\nhtwt['fem_age'] = htwt['AGE'] * htwt['female']\nhtwt.head()\n\n\n\n\n\n\n\n\nROW\nSEX\nAGE\nHEIGHT\nWEIGHT\nfemale\nfem_age\n\n\n\n\n0\n1\nf\n14.3\n56.3\n85.0\n1\n14.3\n\n\n1\n2\nf\n15.5\n62.3\n105.0\n1\n15.5\n\n\n2\n3\nf\n15.3\n63.3\n108.0\n1\n15.3\n\n\n3\n4\nf\n16.1\n59.0\n92.0\n1\n16.1\n\n\n4\n5\nf\n19.1\n62.5\n112.5\n1\n19.1\n\n\n\n\n\n\n\n\n\nRegression Analysis\nNext, we fit a regression model, representing the relationships between gender, age, height and the interaction variable created in the datastep above. We again use a where statement to restrict the analysis to those who are less than or equal to 19 years old. We use the clb option to get a 95% confidence interval for each of the parameters in the model. The model that we are fitting is height = b0 + b1 x female + b2 x age + b3 x fem_age + e\n\nX = htwt[['female', 'AGE', 'fem_age']][htwt['AGE'] <= 19]\nX = sm.add_constant(X)\nY = htwt['HEIGHT'][htwt['AGE'] <= 19]\n\nmodel = sm.OLS(Y, X).fit()\n\nmodel.summary()\n\n\nOLS Regression Results\n\n\nDep. Variable:\nHEIGHT\nR-squared:\n0.460\n\n\nModel:\nOLS\nAdj. R-squared:\n0.452\n\n\nMethod:\nLeast Squares\nF-statistic:\n60.93\n\n\nDate:\nTue, 06 Aug 2024\nProb (F-statistic):\n1.50e-28\n\n\nTime:\n15:06:27\nLog-Likelihood:\n-534.17\n\n\nNo. Observations:\n219\nAIC:\n1076.\n\n\nDf Residuals:\n215\nBIC:\n1090.\n\n\nDf Model:\n3\n\n\n\n\nCovariance Type:\nnonrobust\n\n\n\n\n\n\n\n\n\n\n\ncoef\nstd err\nt\nP>|t|\n[0.025\n0.975]\n\n\nconst\n28.8828\n2.873\n10.052\n0.000\n23.219\n34.547\n\n\nfemale\n13.6123\n4.019\n3.387\n0.001\n5.690\n21.534\n\n\nAGE\n2.0313\n0.178\n11.435\n0.000\n1.681\n2.381\n\n\nfem_age\n-0.9294\n0.248\n-3.750\n0.000\n-1.418\n-0.441\n\n\n\n\n\n\n\n\nOmnibus:\n1.300\nDurbin-Watson:\n2.284\n\n\nProb(Omnibus):\n0.522\nJarque-Bera (JB):\n0.981\n\n\nSkew:\n-0.133\nProb(JB):\n0.612\n\n\nKurtosis:\n3.191\nCond. No.\n450.\n\n\n\nNotes:[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n\n\nFrom the coefficients table b0,b1,b2,b3 are estimated as b0=28.88 b1=13.61 b2=2.03 b3=-0.92942\nThe resulting regression model for height, age and gender based on the available data is height=28.8828 + 13.6123 x female + 2.0313 x age -0.9294 x fem_age"
+    "text": "To demonstrate the use of linear regression we examine a dataset that illustrates the relationship between Height and Weight in a group of 237 teen-aged boys and girls. The dataset is available here and is imported to the workspace.\n\nDescriptive Statistics\nThe first step is to obtain the simple descriptive statistics for the numeric variables of htwt data, and one-way frequencies for categorical variables. This is accomplished by employing summary function. There are 237 participants who are from 13.9 to 25 years old. It is a cross-sectional study, with each participant having one observation. We can use this data set to examine the relationship of participants’ height to their age and sex.\n\nimport pandas as pd\nimport statsmodels.api as sm\n\n# Importing CSV\nhtwt = pd.read_csv(\"../data/htwt.csv\")\n\nIn order to create a regression model to demonstrate the relationship between age and height for females, we first need to create a flag variable identifying females and an interaction variable between age and female gender flag.\n\nhtwt['female'] = (htwt['SEX'] == 'f').astype(int)\nhtwt['fem_age'] = htwt['AGE'] * htwt['female']\nhtwt.head()\n\n\n\n\n\n\n\n\nROW\nSEX\nAGE\nHEIGHT\nWEIGHT\nfemale\nfem_age\n\n\n\n\n0\n1\nf\n14.3\n56.3\n85.0\n1\n14.3\n\n\n1\n2\nf\n15.5\n62.3\n105.0\n1\n15.5\n\n\n2\n3\nf\n15.3\n63.3\n108.0\n1\n15.3\n\n\n3\n4\nf\n16.1\n59.0\n92.0\n1\n16.1\n\n\n4\n5\nf\n19.1\n62.5\n112.5\n1\n19.1\n\n\n\n\n\n\n\n\n\nRegression Analysis\nNext, we fit a regression model, representing the relationships between gender, age, height and the interaction variable created in the datastep above. We again use a where statement to restrict the analysis to those who are less than or equal to 19 years old. We use the clb option to get a 95% confidence interval for each of the parameters in the model. The model that we are fitting is height = b0 + b1 x female + b2 x age + b3 x fem_age + e\n\nX = htwt[['female', 'AGE', 'fem_age']][htwt['AGE'] <= 19]\nX = sm.add_constant(X)\nY = htwt['HEIGHT'][htwt['AGE'] <= 19]\n\nmodel = sm.OLS(Y, X).fit()\n\nmodel.summary()\n\n\nOLS Regression Results\n\n\nDep. Variable:\nHEIGHT\nR-squared:\n0.460\n\n\nModel:\nOLS\nAdj. R-squared:\n0.452\n\n\nMethod:\nLeast Squares\nF-statistic:\n60.93\n\n\nDate:\nTue, 06 Aug 2024\nProb (F-statistic):\n1.50e-28\n\n\nTime:\n15:09:31\nLog-Likelihood:\n-534.17\n\n\nNo. Observations:\n219\nAIC:\n1076.\n\n\nDf Residuals:\n215\nBIC:\n1090.\n\n\nDf Model:\n3\n\n\n\n\nCovariance Type:\nnonrobust\n\n\n\n\n\n\n\n\n\n\n\ncoef\nstd err\nt\nP>|t|\n[0.025\n0.975]\n\n\nconst\n28.8828\n2.873\n10.052\n0.000\n23.219\n34.547\n\n\nfemale\n13.6123\n4.019\n3.387\n0.001\n5.690\n21.534\n\n\nAGE\n2.0313\n0.178\n11.435\n0.000\n1.681\n2.381\n\n\nfem_age\n-0.9294\n0.248\n-3.750\n0.000\n-1.418\n-0.441\n\n\n\n\n\n\n\n\nOmnibus:\n1.300\nDurbin-Watson:\n2.284\n\n\nProb(Omnibus):\n0.522\nJarque-Bera (JB):\n0.981\n\n\nSkew:\n-0.133\nProb(JB):\n0.612\n\n\nKurtosis:\n3.191\nCond. No.\n450.\n\n\n\nNotes:[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n\n\nFrom the coefficients table b0,b1,b2,b3 are estimated as b0=28.88 b1=13.61 b2=2.03 b3=-0.92942\nThe resulting regression model for height, age and gender based on the available data is height=28.8828 + 13.6123 x female + 2.0313 x age -0.9294 x fem_age"
   },
   {
     "objectID": "python/kruskal_wallis.html",
@@ -1152,7 +1152,7 @@
     "href": "R/mi_mar_predictive_mean_match.html#example",
     "title": "Multiple Imputation: Predictive Mean Matching",
     "section": "Example",
-    "text": "Example\nWe use the small dataset nhanes included in mice package. It has 25 rows, and three out of four variables have missings.\nThe original NHANES data is a large national level survey, some are publicly available via R package nhanes.\n\nlibrary(mice)\n\nWarning in check_dep_version(): ABI version mismatch: \nlme4 was built with Matrix ABI version 1\nCurrent Matrix ABI version is 2\nPlease re-install lme4 from source or restore original 'Matrix' package\n\n\n\nAttaching package: 'mice'\n\n\nThe following object is masked from 'package:stats':\n\n    filter\n\n\nThe following objects are masked from 'package:base':\n\n    cbind, rbind\n\n# load example dataset from mice\nhead(nhanes)\n\n  age  bmi hyp chl\n1   1   NA  NA  NA\n2   2 22.7   1 187\n3   1   NA   1 187\n4   3   NA  NA  NA\n5   1 20.4   1 113\n6   3   NA  NA 184\n\nsummary(nhanes)\n\n      age            bmi             hyp             chl       \n Min.   :1.00   Min.   :20.40   Min.   :1.000   Min.   :113.0  \n 1st Qu.:1.00   1st Qu.:22.65   1st Qu.:1.000   1st Qu.:185.0  \n Median :2.00   Median :26.75   Median :1.000   Median :187.0  \n Mean   :1.76   Mean   :26.56   Mean   :1.235   Mean   :191.4  \n 3rd Qu.:2.00   3rd Qu.:28.93   3rd Qu.:1.000   3rd Qu.:212.0  \n Max.   :3.00   Max.   :35.30   Max.   :2.000   Max.   :284.0  \n                NA's   :9       NA's   :8       NA's   :10     \n\n\n\nImpute with PMM\nTo impute with PMM is straightforward: specify the method, method = pmm.\n\nimp_pmm <- mice(nhanes, method = 'pmm', m=5, maxit=10)\n\n\n iter imp variable\n  1   1  bmi  hyp  chl\n  1   2  bmi  hyp  chl\n  1   3  bmi  hyp  chl\n  1   4  bmi  hyp  chl\n  1   5  bmi  hyp  chl\n  2   1  bmi  hyp  chl\n  2   2  bmi  hyp  chl\n  2   3  bmi  hyp  chl\n  2   4  bmi  hyp  chl\n  2   5  bmi  hyp  chl\n  3   1  bmi  hyp  chl\n  3   2  bmi  hyp  chl\n  3   3  bmi  hyp  chl\n  3   4  bmi  hyp  chl\n  3   5  bmi  hyp  chl\n  4   1  bmi  hyp  chl\n  4   2  bmi  hyp  chl\n  4   3  bmi  hyp  chl\n  4   4  bmi  hyp  chl\n  4   5  bmi  hyp  chl\n  5   1  bmi  hyp  chl\n  5   2  bmi  hyp  chl\n  5   3  bmi  hyp  chl\n  5   4  bmi  hyp  chl\n  5   5  bmi  hyp  chl\n  6   1  bmi  hyp  chl\n  6   2  bmi  hyp  chl\n  6   3  bmi  hyp  chl\n  6   4  bmi  hyp  chl\n  6   5  bmi  hyp  chl\n  7   1  bmi  hyp  chl\n  7   2  bmi  hyp  chl\n  7   3  bmi  hyp  chl\n  7   4  bmi  hyp  chl\n  7   5  bmi  hyp  chl\n  8   1  bmi  hyp  chl\n  8   2  bmi  hyp  chl\n  8   3  bmi  hyp  chl\n  8   4  bmi  hyp  chl\n  8   5  bmi  hyp  chl\n  9   1  bmi  hyp  chl\n  9   2  bmi  hyp  chl\n  9   3  bmi  hyp  chl\n  9   4  bmi  hyp  chl\n  9   5  bmi  hyp  chl\n  10   1  bmi  hyp  chl\n  10   2  bmi  hyp  chl\n  10   3  bmi  hyp  chl\n  10   4  bmi  hyp  chl\n  10   5  bmi  hyp  chl\n\nimp_pmm\n\nClass: mids\nNumber of multiple imputations:  5 \nImputation methods:\n  age   bmi   hyp   chl \n   \"\" \"pmm\" \"pmm\" \"pmm\" \nPredictorMatrix:\n    age bmi hyp chl\nage   0   1   1   1\nbmi   1   0   1   1\nhyp   1   1   0   1\nchl   1   1   1   0\n\n# imputations for bmi\nimp_pmm$imp$bmi\n\n      1    2    3    4    5\n1  33.2 22.7 27.2 30.1 27.4\n3  26.3 27.2 26.3 35.3 35.3\n4  28.7 22.5 27.4 22.7 22.7\n6  25.5 24.9 20.4 22.7 27.4\n10 22.5 22.0 22.5 28.7 22.0\n11 29.6 27.4 33.2 35.3 28.7\n12 22.0 25.5 27.5 20.4 28.7\n16 29.6 27.2 30.1 35.3 27.4\n21 27.2 30.1 27.2 25.5 22.5\n\n\nAn alternative to the standard PMM is midastouch.\n\nimp_pmms <- mice(nhanes, method = 'midastouch', m=5, maxit=10)\n\n\n iter imp variable\n  1   1  bmi  hyp  chl\n  1   
2  bmi  hyp  chl\n  1   3  bmi  hyp  chl\n  1   4  bmi  hyp  chl\n  1   5  bmi  hyp  chl\n  2   1  bmi  hyp  chl\n  2   2  bmi  hyp  chl\n  2   3  bmi  hyp  chl\n  2   4  bmi  hyp  chl\n  2   5  bmi  hyp  chl\n  3   1  bmi  hyp  chl\n  3   2  bmi  hyp  chl\n  3   3  bmi  hyp  chl\n  3   4  bmi  hyp  chl\n  3   5  bmi  hyp  chl\n  4   1  bmi  hyp  chl\n  4   2  bmi  hyp  chl\n  4   3  bmi  hyp  chl\n  4   4  bmi  hyp  chl\n  4   5  bmi  hyp  chl\n  5   1  bmi  hyp  chl\n  5   2  bmi  hyp  chl\n  5   3  bmi  hyp  chl\n  5   4  bmi  hyp  chl\n  5   5  bmi  hyp  chl\n  6   1  bmi  hyp  chl\n  6   2  bmi  hyp  chl\n  6   3  bmi  hyp  chl\n  6   4  bmi  hyp  chl\n  6   5  bmi  hyp  chl\n  7   1  bmi  hyp  chl\n  7   2  bmi  hyp  chl\n  7   3  bmi  hyp  chl\n  7   4  bmi  hyp  chl\n  7   5  bmi  hyp  chl\n  8   1  bmi  hyp  chl\n  8   2  bmi  hyp  chl\n  8   3  bmi  hyp  chl\n  8   4  bmi  hyp  chl\n  8   5  bmi  hyp  chl\n  9   1  bmi  hyp  chl\n  9   2  bmi  hyp  chl\n  9   3  bmi  hyp  chl\n  9   4  bmi  hyp  chl\n  9   5  bmi  hyp  chl\n  10   1  bmi  hyp  chl\n  10   2  bmi  hyp  chl\n  10   3  bmi  hyp  chl\n  10   4  bmi  hyp  chl\n  10   5  bmi  hyp  chl\n\nimp_pmm\n\nClass: mids\nNumber of multiple imputations:  5 \nImputation methods:\n  age   bmi   hyp   chl \n   \"\" \"pmm\" \"pmm\" \"pmm\" \nPredictorMatrix:\n    age bmi hyp chl\nage   0   1   1   1\nbmi   1   0   1   1\nhyp   1   1   0   1\nchl   1   1   1   0\n\nimp_pmms$imp$bmi\n\n      1    2    3    4    5\n1  22.5 29.6 35.3 35.3 20.4\n3  30.1 29.6 27.5 30.1 22.0\n4  35.3 22.7 28.7 25.5 22.5\n6  35.3 22.7 24.9 25.5 22.5\n10 26.3 22.0 35.3 22.7 26.3\n11 22.5 22.5 29.6 30.1 20.4\n12 26.3 22.0 27.4 27.4 26.3\n16 22.5 22.5 29.6 30.1 22.0\n21 22.5 22.5 35.3 30.1 20.4"
+    "text": "Example\nWe use the small dataset nhanes included in mice package. It has 25 rows, and three out of four variables have missings.\nThe original NHANES data is a large national level survey, some are publicly available via R package nhanes.\n\nlibrary(mice)\n\nWarning in check_dep_version(): ABI version mismatch: \nlme4 was built with Matrix ABI version 1\nCurrent Matrix ABI version is 2\nPlease re-install lme4 from source or restore original 'Matrix' package\n\n\n\nAttaching package: 'mice'\n\n\nThe following object is masked from 'package:stats':\n\n    filter\n\n\nThe following objects are masked from 'package:base':\n\n    cbind, rbind\n\n# load example dataset from mice\nhead(nhanes)\n\n  age  bmi hyp chl\n1   1   NA  NA  NA\n2   2 22.7   1 187\n3   1   NA   1 187\n4   3   NA  NA  NA\n5   1 20.4   1 113\n6   3   NA  NA 184\n\nsummary(nhanes)\n\n      age            bmi             hyp             chl       \n Min.   :1.00   Min.   :20.40   Min.   :1.000   Min.   :113.0  \n 1st Qu.:1.00   1st Qu.:22.65   1st Qu.:1.000   1st Qu.:185.0  \n Median :2.00   Median :26.75   Median :1.000   Median :187.0  \n Mean   :1.76   Mean   :26.56   Mean   :1.235   Mean   :191.4  \n 3rd Qu.:2.00   3rd Qu.:28.93   3rd Qu.:1.000   3rd Qu.:212.0  \n Max.   :3.00   Max.   :35.30   Max.   :2.000   Max.   :284.0  \n                NA's   :9       NA's   :8       NA's   :10     \n\n\n\nImpute with PMM\nTo impute with PMM is straightforward: specify the method, method = pmm.\n\nimp_pmm <- mice(nhanes, method = 'pmm', m=5, maxit=10)\n\n\n iter imp variable\n  1   1  bmi  hyp  chl\n  1   2  bmi  hyp  chl\n  1   3  bmi  hyp  chl\n  1   4  bmi  hyp  chl\n  1   5  bmi  hyp  chl\n  2   1  bmi  hyp  chl\n  2   2  bmi  hyp  chl\n  2   3  bmi  hyp  chl\n  2   4  bmi  hyp  chl\n  2   5  bmi  hyp  chl\n  3   1  bmi  hyp  chl\n  3   2  bmi  hyp  chl\n  3   3  bmi  hyp  chl\n  3   4  bmi  hyp  chl\n  3   5  bmi  hyp  chl\n  4   1  bmi  hyp  chl\n  4   2  bmi  hyp  chl\n  4   3  bmi  hyp  chl\n  4   4  bmi  hyp  chl\n  4   5  bmi  hyp  chl\n  5   1  bmi  hyp  chl\n  5   2  bmi  hyp  chl\n  5   3  bmi  hyp  chl\n  5   4  bmi  hyp  chl\n  5   5  bmi  hyp  chl\n  6   1  bmi  hyp  chl\n  6   2  bmi  hyp  chl\n  6   3  bmi  hyp  chl\n  6   4  bmi  hyp  chl\n  6   5  bmi  hyp  chl\n  7   1  bmi  hyp  chl\n  7   2  bmi  hyp  chl\n  7   3  bmi  hyp  chl\n  7   4  bmi  hyp  chl\n  7   5  bmi  hyp  chl\n  8   1  bmi  hyp  chl\n  8   2  bmi  hyp  chl\n  8   3  bmi  hyp  chl\n  8   4  bmi  hyp  chl\n  8   5  bmi  hyp  chl\n  9   1  bmi  hyp  chl\n  9   2  bmi  hyp  chl\n  9   3  bmi  hyp  chl\n  9   4  bmi  hyp  chl\n  9   5  bmi  hyp  chl\n  10   1  bmi  hyp  chl\n  10   2  bmi  hyp  chl\n  10   3  bmi  hyp  chl\n  10   4  bmi  hyp  chl\n  10   5  bmi  hyp  chl\n\nimp_pmm\n\nClass: mids\nNumber of multiple imputations:  5 \nImputation methods:\n  age   bmi   hyp   chl \n   \"\" \"pmm\" \"pmm\" \"pmm\" \nPredictorMatrix:\n    age bmi hyp chl\nage   0   1   1   1\nbmi   1   0   1   1\nhyp   1   1   0   1\nchl   1   1   1   0\n\n# imputations for bmi\nimp_pmm$imp$bmi\n\n      1    2    3    4    5\n1  29.6 28.7 27.4 29.6 33.2\n3  27.2 30.1 35.3 30.1 22.0\n4  22.5 20.4 27.5 27.4 22.7\n6  20.4 27.4 21.7 22.5 21.7\n10 22.0 22.7 25.5 20.4 22.7\n11 27.4 27.2 29.6 35.3 30.1\n12 27.4 30.1 27.4 22.0 26.3\n16 30.1 27.2 30.1 27.4 24.9\n21 27.2 30.1 35.3 30.1 22.5\n\n\nAn alternative to the standard PMM is midastouch.\n\nimp_pmms <- mice(nhanes, method = 'midastouch', m=5, maxit=10)\n\n\n iter imp variable\n  1   1  bmi  hyp  chl\n  1   
2  bmi  hyp  chl\n  1   3  bmi  hyp  chl\n  1   4  bmi  hyp  chl\n  1   5  bmi  hyp  chl\n  2   1  bmi  hyp  chl\n  2   2  bmi  hyp  chl\n  2   3  bmi  hyp  chl\n  2   4  bmi  hyp  chl\n  2   5  bmi  hyp  chl\n  3   1  bmi  hyp  chl\n  3   2  bmi  hyp  chl\n  3   3  bmi  hyp  chl\n  3   4  bmi  hyp  chl\n  3   5  bmi  hyp  chl\n  4   1  bmi  hyp  chl\n  4   2  bmi  hyp  chl\n  4   3  bmi  hyp  chl\n  4   4  bmi  hyp  chl\n  4   5  bmi  hyp  chl\n  5   1  bmi  hyp  chl\n  5   2  bmi  hyp  chl\n  5   3  bmi  hyp  chl\n  5   4  bmi  hyp  chl\n  5   5  bmi  hyp  chl\n  6   1  bmi  hyp  chl\n  6   2  bmi  hyp  chl\n  6   3  bmi  hyp  chl\n  6   4  bmi  hyp  chl\n  6   5  bmi  hyp  chl\n  7   1  bmi  hyp  chl\n  7   2  bmi  hyp  chl\n  7   3  bmi  hyp  chl\n  7   4  bmi  hyp  chl\n  7   5  bmi  hyp  chl\n  8   1  bmi  hyp  chl\n  8   2  bmi  hyp  chl\n  8   3  bmi  hyp  chl\n  8   4  bmi  hyp  chl\n  8   5  bmi  hyp  chl\n  9   1  bmi  hyp  chl\n  9   2  bmi  hyp  chl\n  9   3  bmi  hyp  chl\n  9   4  bmi  hyp  chl\n  9   5  bmi  hyp  chl\n  10   1  bmi  hyp  chl\n  10   2  bmi  hyp  chl\n  10   3  bmi  hyp  chl\n  10   4  bmi  hyp  chl\n  10   5  bmi  hyp  chl\n\nimp_pmm\n\nClass: mids\nNumber of multiple imputations:  5 \nImputation methods:\n  age   bmi   hyp   chl \n   \"\" \"pmm\" \"pmm\" \"pmm\" \nPredictorMatrix:\n    age bmi hyp chl\nage   0   1   1   1\nbmi   1   0   1   1\nhyp   1   1   0   1\nchl   1   1   1   0\n\nimp_pmms$imp$bmi\n\n      1    2    3    4    5\n1  30.1 28.7 30.1 20.4 28.7\n3  30.1 33.2 30.1 30.1 28.7\n4  24.9 22.5 25.5 22.5 25.5\n6  24.9 22.5 25.5 21.7 27.4\n10 30.1 27.4 22.7 26.3 28.7\n11 30.1 28.7 30.1 30.1 24.9\n12 30.1 27.4 22.0 26.3 28.7\n16 30.1 28.7 20.4 28.7 27.5\n21 33.2 22.5 28.7 20.4 27.5"
   },
   {
     "objectID": "R/association.html",
@@ -1747,7 +1747,7 @@
     "href": "python/logistic_regression.html#statsmodels-package",
     "title": "Logistic Regression",
     "section": "Statsmodels package",
-    "text": "Statsmodels package\nWe will use the sm.Logit() method to fit our logistic regression model.\n\n#intercept column\nx_sm = sm.add_constant(x)\n\n#fit model\nlr_sm = sm.Logit(y, x_sm).fit() \nprint(lr_sm.summary())\n\nOptimization terminated successfully.\n         Current function value: 0.568825\n         Iterations 5\n                           Logit Regression Results                           \n==============================================================================\nDep. Variable:                 wt_grp   No. Observations:                  167\nModel:                          Logit   Df Residuals:                      162\nMethod:                           MLE   Df Model:                            4\nDate:                Tue, 06 Aug 2024   Pseudo R-squ.:                 0.05169\nTime:                        15:06:19   Log-Likelihood:                -94.994\nconverged:                       True   LL-Null:                       -100.17\nCovariance Type:            nonrobust   LLR p-value:                   0.03484\n==============================================================================\n                 coef    std err          z      P>|z|      [0.025      0.975]\n------------------------------------------------------------------------------\nconst          3.3576      1.654      2.029      0.042       0.115       6.600\nage           -0.0126      0.021     -0.598      0.550      -0.054       0.029\nsex           -0.8645      0.371     -2.328      0.020      -1.592      -0.137\nph.ecog        0.4182      0.263      1.592      0.111      -0.097       0.933\nmeal.cal      -0.0009      0.000     -1.932      0.053      -0.002    1.27e-05\n==============================================================================\n\n\n\nModel fitting\nIn addition to the information contained in the summary, we can display the model coefficients as odds ratios:\n\nprint(\"Odds ratios for statsmodels logistic regression:\")\nprint(np.exp(lr_sm.params))\n\nOdds ratios for statsmodels logistic regression:\nconst       28.719651\nage          0.987467\nsex          0.421266\nph.ecog      1.519198\nmeal.cal     0.999140\ndtype: float64\n\n\nWe can also provide the 5% confidence intervals for the odds ratios:\n\nprint(\"CI at 5% for statsmodels logistic regression:\")\nprint(np.exp(lr_sm.conf_int(alpha = 0.05)))\n\nCI at 5% for statsmodels logistic regression:\n                 0           1\nconst     1.121742  735.301118\nage       0.947449    1.029175\nsex       0.203432    0.872354\nph.ecog   0.907984    2.541852\nmeal.cal  0.998269    1.000013\n\n\n\n\nPrediction\nLet’s use our trained model to make a weight loss prediction about a new patient.\n\n# new female, symptomatic but completely ambulatory patient consuming 2500 calories\nnew_pt = pd.DataFrame({\n    \"age\": [56],\n    \"sex\": [2],\n    \"ph.ecog\": [1.00], \n    \"meal.cal\": [2500]\n})\n\n# Add intercept term to the new data; for a single row this should be \n# forced using the `add_constant` command\nnew_pt_sm = sm.add_constant(new_pt, has_constant=\"add\")\nprint(\"Probability of weight loss using the statsmodels package:\")\nprint(lr_sm.predict(new_pt_sm))\n\nProbability of weight loss using the statsmodels package:\n0    0.308057\ndtype: float64"
+    "text": "Statsmodels package\nWe will use the sm.Logit() method to fit our logistic regression model.\n\n#intercept column\nx_sm = sm.add_constant(x)\n\n#fit model\nlr_sm = sm.Logit(y, x_sm).fit() \nprint(lr_sm.summary())\n\nOptimization terminated successfully.\n         Current function value: 0.568825\n         Iterations 5\n                           Logit Regression Results                           \n==============================================================================\nDep. Variable:                 wt_grp   No. Observations:                  167\nModel:                          Logit   Df Residuals:                      162\nMethod:                           MLE   Df Model:                            4\nDate:                Tue, 06 Aug 2024   Pseudo R-squ.:                 0.05169\nTime:                        15:09:23   Log-Likelihood:                -94.994\nconverged:                       True   LL-Null:                       -100.17\nCovariance Type:            nonrobust   LLR p-value:                   0.03484\n==============================================================================\n                 coef    std err          z      P>|z|      [0.025      0.975]\n------------------------------------------------------------------------------\nconst          3.3576      1.654      2.029      0.042       0.115       6.600\nage           -0.0126      0.021     -0.598      0.550      -0.054       0.029\nsex           -0.8645      0.371     -2.328      0.020      -1.592      -0.137\nph.ecog        0.4182      0.263      1.592      0.111      -0.097       0.933\nmeal.cal      -0.0009      0.000     -1.932      0.053      -0.002    1.27e-05\n==============================================================================\n\n\n\nModel fitting\nIn addition to the information contained in the summary, we can display the model coefficients as odds ratios:\n\nprint(\"Odds ratios for statsmodels logistic regression:\")\nprint(np.exp(lr_sm.params))\n\nOdds ratios for statsmodels logistic regression:\nconst       28.719651\nage          0.987467\nsex          0.421266\nph.ecog      1.519198\nmeal.cal     0.999140\ndtype: float64\n\n\nWe can also provide the 5% confidence intervals for the odds ratios:\n\nprint(\"CI at 5% for statsmodels logistic regression:\")\nprint(np.exp(lr_sm.conf_int(alpha = 0.05)))\n\nCI at 5% for statsmodels logistic regression:\n                 0           1\nconst     1.121742  735.301118\nage       0.947449    1.029175\nsex       0.203432    0.872354\nph.ecog   0.907984    2.541852\nmeal.cal  0.998269    1.000013\n\n\n\n\nPrediction\nLet’s use our trained model to make a weight loss prediction about a new patient.\n\n# new female, symptomatic but completely ambulatory patient consuming 2500 calories\nnew_pt = pd.DataFrame({\n    \"age\": [56],\n    \"sex\": [2],\n    \"ph.ecog\": [1.00], \n    \"meal.cal\": [2500]\n})\n\n# Add intercept term to the new data; for a single row this should be \n# forced using the `add_constant` command\nnew_pt_sm = sm.add_constant(new_pt, has_constant=\"add\")\nprint(\"Probability of weight loss using the statsmodels package:\")\nprint(lr_sm.predict(new_pt_sm))\n\nProbability of weight loss using the statsmodels package:\n0    0.308057\ndtype: float64"
   },
   {
     "objectID": "python/logistic_regression.html#scikit-learn-package",
@@ -1789,7 +1789,7 @@
     "href": "python/ancova.html#ancova-in-python",
     "title": "Ancova",
     "section": "Ancova in Python",
-    "text": "Ancova in Python\nIn Python, Ancova can be performed using the statsmodels library from the scipy package.\n\nimport statsmodels.api as sm\nimport statsmodels.formula.api as smf\nfrom tabulate import tabulate\n\n# Fit the ANCOVA model\nmodel_ancova = smf.ols('post ~ drug + pre', data=df).fit()\n\n# Summary of the model\nmodel_summary = model_ancova.summary()\nprint(model_summary)\n\n# Extracting glance (summary) information\nmodel_glance = {\n    'r_squared': model_ancova.rsquared,\n    'adj_r_squared': model_ancova.rsquared_adj,\n    'f_statistic': model_ancova.fvalue,\n    'f_pvalue': model_ancova.f_pvalue,\n    'aic': model_ancova.aic,\n    'bic': model_ancova.bic\n}\nmodel_glance_df = pd.DataFrame([model_glance])\nprint(tabulate(model_glance_df, headers='keys', tablefmt='grid'))\n\n# Extracting tidy (coefficients) information\nmodel_tidy = model_ancova.summary2().tables[1]\nprint(tabulate(model_tidy, headers='keys', tablefmt='grid'))\n\n                            OLS Regression Results                            \n==============================================================================\nDep. Variable:                   post   R-squared:                       0.676\nModel:                            OLS   Adj. R-squared:                  0.639\nMethod:                 Least Squares   F-statistic:                     18.10\nDate:                Tue, 06 Aug 2024   Prob (F-statistic):           1.50e-06\nTime:                        15:06:24   Log-Likelihood:                -82.054\nNo. Observations:                  30   AIC:                             172.1\nDf Residuals:                      26   BIC:                             177.7\nDf Model:                           3                                         \nCovariance Type:            nonrobust                                         \n==============================================================================\n                 coef    std err          t      P>|t|      [0.025      0.975]\n------------------------------------------------------------------------------\nIntercept     -3.8808      1.986     -1.954      0.062      -7.964       0.202\ndrug[T.D]      0.1090      1.795      0.061      0.952      -3.581       3.799\ndrug[T.F]      3.4461      1.887      1.826      0.079      -0.432       7.324\npre            0.9872      0.164      6.001      0.000       0.649       1.325\n==============================================================================\nOmnibus:                        2.609   Durbin-Watson:                   2.526\nProb(Omnibus):                  0.271   Jarque-Bera (JB):                2.148\nSkew:                           0.645   Prob(JB):                        0.342\nKurtosis:                       2.765   Cond. No.                         
39.8\n==============================================================================\n\nNotes:\n[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n+----+-------------+-----------------+---------------+-------------+---------+---------+\n|    |   r_squared |   adj_r_squared |   f_statistic |    f_pvalue |     aic |     bic |\n+====+=============+=================+===============+=============+=========+=========+\n|  0 |    0.676261 |        0.638906 |       18.1039 | 1.50137e-06 | 172.108 | 177.712 |\n+----+-------------+-----------------+---------------+-------------+---------+---------+\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n|           |     Coef. |   Std.Err. |          t |       P>|t| |    [0.025 |   0.975] |\n+===========+===========+============+============+=============+===========+==========+\n| Intercept | -3.88081  |   1.9862   | -1.95388   | 0.0615519   | -7.96351  | 0.201887 |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n| drug[T.D] |  0.108971 |   1.79514  |  0.0607037 | 0.952059    | -3.58098  | 3.79892  |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n| drug[T.F] |  3.44614  |   1.88678  |  1.82646   | 0.0792846   | -0.432195 | 7.32447  |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n| pre       |  0.987184 |   0.164498 |  6.00121   | 2.45433e-06 |  0.649054 | 1.32531  |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n\n\nPlease note that all values match with the corresponding R version, except for the AIC and BIC values, which differ slightly. This should be acceptable for most practical purposes in statistical analysis. Currently, there are ongoing discussions in the statsmodels community regarding the computational details of AIC and BIC.\nThe following code can be used to enforce complete consistency of AIC and BIC values with R outputs by adding 1 to the number of parameters:\n\nimport numpy as np\n\n# Manual calculation of AIC and BIC to ensure consistency with R\nn = df.shape[0]  # number of observations\nk = model_ancova.df_model + 1  # number of parameters (including intercept)\nlog_lik = model_ancova.llf  # log-likelihood\n\n# Adjusted number of parameters (including scale parameter)\nk_adjusted = k + 1\n\n# Manually calculate AIC and BIC to match R's behavior\naic_adjusted = 2 * k_adjusted - 2 * log_lik\nbic_adjusted = np.log(n) * k_adjusted - 2 * log_lik\n\nprint(f\"Number of observations (n): {n}\")\nprint(f\"Number of parameters (k_adjusted): {k_adjusted}\")\nprint(f\"Log-likelihood: {log_lik}\")\nprint(f\"AIC (adjusted): {aic_adjusted}\")\nprint(f\"BIC (adjusted): {bic_adjusted}\")\n\nNumber of observations (n): 30\nNumber of parameters (k_adjusted): 5.0\nLog-likelihood: -82.0537744890265\nAIC (adjusted): 174.107548978053\nBIC (adjusted): 181.11353588636376\n\n\nThere are different types of anova computations. The statsmodels.stats.anova.anova_lm function allows the types 1, 2 and 3. 
The code to compute these types is depicted below:\n\nimport statsmodels.formula.api as smf\nimport statsmodels.stats.anova as ssa\n\n# Center the predictor for Type III anova\n#df['pre_centered'] = df['pre'] - df['pre'].mean()\n\n# Fit the model for types I and II anova\nmodel = smf.ols('post ~ C(drug) + pre', data=df).fit()\n\n# Perform anova for types I and II\nancova_table_type_1 = ssa.anova_lm(model, typ=1)\nancova_table_type_2 = ssa.anova_lm(model, typ=2)\n\n# Fit the model for Type III anova with centered predictors\nmodel_type_3 = smf.ols('post ~ C(drug) + pre', data=df).fit()\nancova_table_type_3 = ssa.anova_lm(model_type_3, typ=3)\n\n# Calculate SSd (sum of squares for residuals)\nssd_type1 = ancova_table_type_1['sum_sq'].loc['Residual']\nssd_type2 = ancova_table_type_2['sum_sq'].loc['Residual']\nssd_type3 = ancova_table_type_3['sum_sq'].loc['Residual']\n\n# Calculate ges\nancova_table_type_1['ges'] = ancova_table_type_1['sum_sq'] / (ancova_table_type_1['sum_sq'] + ssd_type1)\nancova_table_type_2['ges'] = ancova_table_type_2['sum_sq'] / (ancova_table_type_2['sum_sq'] + ssd_type2)\nancova_table_type_3['ges'] = ancova_table_type_3['sum_sq'] / (ancova_table_type_3['sum_sq'] + ssd_type3)\n\n# Add SSd column\nancova_table_type_1['SSd'] = ssd_type1\nancova_table_type_2['SSd'] = ssd_type2\nancova_table_type_3['SSd'] = ssd_type3\n\n# Add significance column\nancova_table_type_1['p<0.05'] = ancova_table_type_1['PR(>F)'] < 0.05\nancova_table_type_2['p<0.05'] = ancova_table_type_2['PR(>F)'] < 0.05\nancova_table_type_3['p<0.05'] = ancova_table_type_3['PR(>F)'] < 0.05\n\n# Rename columns to match the R output\nancova_table_type_1.rename(columns={'sum_sq': 'SSn', 'df': 'DFn', 'F': 'F', 'PR(>F)': 'p'}, inplace=True)\nancova_table_type_1.reset_index(inplace=True)\nancova_table_type_1.rename(columns={'index': 'Effect'}, inplace=True)\n\nancova_table_type_2.rename(columns={'sum_sq': 'SSn', 'df': 'DFn', 'F': 'F', 'PR(>F)': 'p'}, inplace=True)\nancova_table_type_2.reset_index(inplace=True)\nancova_table_type_2.rename(columns={'index': 'Effect'}, inplace=True)\n\nancova_table_type_3.rename(columns={'sum_sq': 'SSn', 'df': 'DFn', 'F': 'F', 'PR(>F)': 'p'}, inplace=True)\nancova_table_type_3.reset_index(inplace=True)\nancova_table_type_3.rename(columns={'index': 'Effect'}, inplace=True)\n\n# Calculate DFd (degrees of freedom for residuals)\ndfd_type1 = ancova_table_type_1.loc[ancova_table_type_1['Effect'] == 'Residual', 'DFn'].values[0]\ndfd_type2 = ancova_table_type_2.loc[ancova_table_type_2['Effect'] == 'Residual', 'DFn'].values[0]\ndfd_type3 = ancova_table_type_3.loc[ancova_table_type_3['Effect'] == 'Residual', 'DFn'].values[0]\nancova_table_type_1['DFd'] = dfd_type1\nancova_table_type_2['DFd'] = dfd_type2\nancova_table_type_3['DFd'] = dfd_type3\n\n# Filter out the Residual row\nancova_table_type_1 = ancova_table_type_1[ancova_table_type_1['Effect'] != 'Residual']\nancova_table_type_2 = ancova_table_type_2[ancova_table_type_2['Effect'] != 'Residual']\nancova_table_type_3 = ancova_table_type_3[ancova_table_type_3['Effect'] != 'Residual']\n\n# Select and reorder columns to match the R output\nancova_table_type_1 = ancova_table_type_1[['Effect', 'DFn', 'DFd', 'SSn', 'SSd', 'F', 'p', 'p<0.05', 'ges']]\nancova_table_type_2 = ancova_table_type_2[['Effect', 'DFn', 'DFd', 'SSn', 'SSd', 'F', 'p', 'p<0.05', 'ges']]\nancova_table_type_3 = ancova_table_type_3[['Effect', 'DFn', 'DFd', 'SSn', 'SSd', 'F', 'p', 'p<0.05', 'ges']]"
+    "text": "Ancova in Python\nIn Python, Ancova can be performed using the statsmodels library from the scipy package.\n\nimport statsmodels.api as sm\nimport statsmodels.formula.api as smf\nfrom tabulate import tabulate\n\n# Fit the ANCOVA model\nmodel_ancova = smf.ols('post ~ drug + pre', data=df).fit()\n\n# Summary of the model\nmodel_summary = model_ancova.summary()\nprint(model_summary)\n\n# Extracting glance (summary) information\nmodel_glance = {\n    'r_squared': model_ancova.rsquared,\n    'adj_r_squared': model_ancova.rsquared_adj,\n    'f_statistic': model_ancova.fvalue,\n    'f_pvalue': model_ancova.f_pvalue,\n    'aic': model_ancova.aic,\n    'bic': model_ancova.bic\n}\nmodel_glance_df = pd.DataFrame([model_glance])\nprint(tabulate(model_glance_df, headers='keys', tablefmt='grid'))\n\n# Extracting tidy (coefficients) information\nmodel_tidy = model_ancova.summary2().tables[1]\nprint(tabulate(model_tidy, headers='keys', tablefmt='grid'))\n\n                            OLS Regression Results                            \n==============================================================================\nDep. Variable:                   post   R-squared:                       0.676\nModel:                            OLS   Adj. R-squared:                  0.639\nMethod:                 Least Squares   F-statistic:                     18.10\nDate:                Tue, 06 Aug 2024   Prob (F-statistic):           1.50e-06\nTime:                        15:09:28   Log-Likelihood:                -82.054\nNo. Observations:                  30   AIC:                             172.1\nDf Residuals:                      26   BIC:                             177.7\nDf Model:                           3                                         \nCovariance Type:            nonrobust                                         \n==============================================================================\n                 coef    std err          t      P>|t|      [0.025      0.975]\n------------------------------------------------------------------------------\nIntercept     -3.8808      1.986     -1.954      0.062      -7.964       0.202\ndrug[T.D]      0.1090      1.795      0.061      0.952      -3.581       3.799\ndrug[T.F]      3.4461      1.887      1.826      0.079      -0.432       7.324\npre            0.9872      0.164      6.001      0.000       0.649       1.325\n==============================================================================\nOmnibus:                        2.609   Durbin-Watson:                   2.526\nProb(Omnibus):                  0.271   Jarque-Bera (JB):                2.148\nSkew:                           0.645   Prob(JB):                        0.342\nKurtosis:                       2.765   Cond. No.                         
39.8\n==============================================================================\n\nNotes:\n[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n+----+-------------+-----------------+---------------+-------------+---------+---------+\n|    |   r_squared |   adj_r_squared |   f_statistic |    f_pvalue |     aic |     bic |\n+====+=============+=================+===============+=============+=========+=========+\n|  0 |    0.676261 |        0.638906 |       18.1039 | 1.50137e-06 | 172.108 | 177.712 |\n+----+-------------+-----------------+---------------+-------------+---------+---------+\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n|           |     Coef. |   Std.Err. |          t |       P>|t| |    [0.025 |   0.975] |\n+===========+===========+============+============+=============+===========+==========+\n| Intercept | -3.88081  |   1.9862   | -1.95388   | 0.0615519   | -7.96351  | 0.201887 |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n| drug[T.D] |  0.108971 |   1.79514  |  0.0607037 | 0.952059    | -3.58098  | 3.79892  |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n| drug[T.F] |  3.44614  |   1.88678  |  1.82646   | 0.0792846   | -0.432195 | 7.32447  |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n| pre       |  0.987184 |   0.164498 |  6.00121   | 2.45433e-06 |  0.649054 | 1.32531  |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n\n\nPlease note that all values match with the corresponding R version, except for the AIC and BIC values, which differ slightly. This should be acceptable for most practical purposes in statistical analysis. Currently, there are ongoing discussions in the statsmodels community regarding the computational details of AIC and BIC.\nThe following code can be used to enforce complete consistency of AIC and BIC values with R outputs by adding 1 to the number of parameters:\n\nimport numpy as np\n\n# Manual calculation of AIC and BIC to ensure consistency with R\nn = df.shape[0]  # number of observations\nk = model_ancova.df_model + 1  # number of parameters (including intercept)\nlog_lik = model_ancova.llf  # log-likelihood\n\n# Adjusted number of parameters (including scale parameter)\nk_adjusted = k + 1\n\n# Manually calculate AIC and BIC to match R's behavior\naic_adjusted = 2 * k_adjusted - 2 * log_lik\nbic_adjusted = np.log(n) * k_adjusted - 2 * log_lik\n\nprint(f\"Number of observations (n): {n}\")\nprint(f\"Number of parameters (k_adjusted): {k_adjusted}\")\nprint(f\"Log-likelihood: {log_lik}\")\nprint(f\"AIC (adjusted): {aic_adjusted}\")\nprint(f\"BIC (adjusted): {bic_adjusted}\")\n\nNumber of observations (n): 30\nNumber of parameters (k_adjusted): 5.0\nLog-likelihood: -82.0537744890265\nAIC (adjusted): 174.107548978053\nBIC (adjusted): 181.11353588636376\n\n\nThere are different types of anova computations. The statsmodels.stats.anova.anova_lm function allows the types 1, 2 and 3. 
The code to compute these types is depicted below:\n\nimport statsmodels.formula.api as smf\nimport statsmodels.stats.anova as ssa\n\n# Center the predictor for Type III anova\n#df['pre_centered'] = df['pre'] - df['pre'].mean()\n\n# Fit the model for types I and II anova\nmodel = smf.ols('post ~ C(drug) + pre', data=df).fit()\n\n# Perform anova for types I and II\nancova_table_type_1 = ssa.anova_lm(model, typ=1)\nancova_table_type_2 = ssa.anova_lm(model, typ=2)\n\n# Fit the model for Type III anova with centered predictors\nmodel_type_3 = smf.ols('post ~ C(drug) + pre', data=df).fit()\nancova_table_type_3 = ssa.anova_lm(model_type_3, typ=3)\n\n# Calculate SSd (sum of squares for residuals)\nssd_type1 = ancova_table_type_1['sum_sq'].loc['Residual']\nssd_type2 = ancova_table_type_2['sum_sq'].loc['Residual']\nssd_type3 = ancova_table_type_3['sum_sq'].loc['Residual']\n\n# Calculate ges\nancova_table_type_1['ges'] = ancova_table_type_1['sum_sq'] / (ancova_table_type_1['sum_sq'] + ssd_type1)\nancova_table_type_2['ges'] = ancova_table_type_2['sum_sq'] / (ancova_table_type_2['sum_sq'] + ssd_type2)\nancova_table_type_3['ges'] = ancova_table_type_3['sum_sq'] / (ancova_table_type_3['sum_sq'] + ssd_type3)\n\n# Add SSd column\nancova_table_type_1['SSd'] = ssd_type1\nancova_table_type_2['SSd'] = ssd_type2\nancova_table_type_3['SSd'] = ssd_type3\n\n# Add significance column\nancova_table_type_1['p<0.05'] = ancova_table_type_1['PR(>F)'] < 0.05\nancova_table_type_2['p<0.05'] = ancova_table_type_2['PR(>F)'] < 0.05\nancova_table_type_3['p<0.05'] = ancova_table_type_3['PR(>F)'] < 0.05\n\n# Rename columns to match the R output\nancova_table_type_1.rename(columns={'sum_sq': 'SSn', 'df': 'DFn', 'F': 'F', 'PR(>F)': 'p'}, inplace=True)\nancova_table_type_1.reset_index(inplace=True)\nancova_table_type_1.rename(columns={'index': 'Effect'}, inplace=True)\n\nancova_table_type_2.rename(columns={'sum_sq': 'SSn', 'df': 'DFn', 'F': 'F', 'PR(>F)': 'p'}, inplace=True)\nancova_table_type_2.reset_index(inplace=True)\nancova_table_type_2.rename(columns={'index': 'Effect'}, inplace=True)\n\nancova_table_type_3.rename(columns={'sum_sq': 'SSn', 'df': 'DFn', 'F': 'F', 'PR(>F)': 'p'}, inplace=True)\nancova_table_type_3.reset_index(inplace=True)\nancova_table_type_3.rename(columns={'index': 'Effect'}, inplace=True)\n\n# Calculate DFd (degrees of freedom for residuals)\ndfd_type1 = ancova_table_type_1.loc[ancova_table_type_1['Effect'] == 'Residual', 'DFn'].values[0]\ndfd_type2 = ancova_table_type_2.loc[ancova_table_type_2['Effect'] == 'Residual', 'DFn'].values[0]\ndfd_type3 = ancova_table_type_3.loc[ancova_table_type_3['Effect'] == 'Residual', 'DFn'].values[0]\nancova_table_type_1['DFd'] = dfd_type1\nancova_table_type_2['DFd'] = dfd_type2\nancova_table_type_3['DFd'] = dfd_type3\n\n# Filter out the Residual row\nancova_table_type_1 = ancova_table_type_1[ancova_table_type_1['Effect'] != 'Residual']\nancova_table_type_2 = ancova_table_type_2[ancova_table_type_2['Effect'] != 'Residual']\nancova_table_type_3 = ancova_table_type_3[ancova_table_type_3['Effect'] != 'Residual']\n\n# Select and reorder columns to match the R output\nancova_table_type_1 = ancova_table_type_1[['Effect', 'DFn', 'DFd', 'SSn', 'SSd', 'F', 'p', 'p<0.05', 'ges']]\nancova_table_type_2 = ancova_table_type_2[['Effect', 'DFn', 'DFd', 'SSn', 'SSd', 'F', 'p', 'p<0.05', 'ges']]\nancova_table_type_3 = ancova_table_type_3[['Effect', 'DFn', 'DFd', 'SSn', 'SSd', 'F', 'p', 'p<0.05', 'ges']]"
   },
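One point worth noting about the Type III table above: Type III sums of squares are sensitive to how the factor is coded, and with patsy's default treatment contrasts the main-effect tests are generally not the ones usually reported against R's car::Anova(..., type = 3). The lines below are a minimal sketch of the usual workaround, assuming the df data frame used in the code above; they are not part of the original page and simply request sum-to-zero (deviation) contrasts for the factor in the formula.

import statsmodels.formula.api as smf
import statsmodels.stats.anova as ssa

# Sum-to-zero (deviation) coding for drug, then a Type III decomposition
model_sum = smf.ols('post ~ C(drug, Sum) + pre', data=df).fit()
print(ssa.anova_lm(model_sum, typ=3))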
   {
     "objectID": "python/ancova.html#type-1-ancova-in-python",