From 21f8cc3230ddc6b0ee5dd86c51769c035f7793e8 Mon Sep 17 00:00:00 2001
From: statasaurus <55274484+statasaurus@users.noreply.github.com>
Date: Mon, 5 Aug 2024 13:37:35 +0000
Subject: [PATCH] Deploying to gh-pages from @ PSIAIMS/CAMIS@30d7ff0ad0783003bf42a5212f51f2f57a66c51f 🚀
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Comp/r-sas_cmh.html                  | 106 ++---
 Comp/r-sas_survey-stats-summary.html | 106 ++---
 R/PCA_analysis.html                  |   4 +-
 R/ancova.html                        | 636 +++++++++++++--------------
 R/cmh.html                           | 106 ++---
 R/mi_mar_predictive_mean_match.html  |  36 +-
 R/survey-stats-summary.html          | 212 ++++-----
 SAS/survey-stats-summary.html        | 212 ++++-----
 blogs/index.html                     |   6 +-
 index.html                           | 108 ++---
 minutes/index.html                   |  38 +-
 python/MANOVA.html                   |   2 +-
 python/Rounding.html                 |   2 +-
 python/Summary_statistics.html       |   2 +-
 python/ancova.html                   |  18 +-
 python/anova.html                    |   2 +-
 python/correlation.html              |   6 +-
 python/kruskal_wallis.html           |   4 +-
 python/linear_regression.html        |   8 +-
 python/logistic_regression.html      |  22 +-
 python/one_sample_t_test.html        |   4 +-
 python/paired_t_test.html            |   4 +-
 python/skewness_kurtosis.html        |   6 +-
 python/two_samples_t_test.html       |   6 +-
 search.json                          |  10 +-
 25 files changed, 833 insertions(+), 833 deletions(-)

diff --git a/Comp/r-sas_cmh.html b/Comp/r-sas_cmh.html
index 2bb4b01b..975431ae 100644
--- a/Comp/r-sas_cmh.html
+++ b/Comp/r-sas_cmh.html
@@ -348,23 +348,23 @@

CMH Statistics

As can be seen, there are two schemata where R does not provide any results:

diff --git a/Comp/r-sas_survey-stats-summary.html b/Comp/r-sas_survey-stats-summary.html
index 4b26382a..87a4b01c 100644
--- a/Comp/r-sas_survey-stats-summary.html
+++ b/Comp/r-sas_survey-stats-summary.html
@@ -639,23 +639,23 @@

Quantiles

head(apisrs) |> gt::gt()
diff --git a/R/PCA_analysis.html b/R/PCA_analysis.html
index 352d4d8f..e50ee150 100644
--- a/R/PCA_analysis.html
+++ b/R/PCA_analysis.html
@@ -364,8 +364,8 @@

scene = list(bgcolor = "lightgray"))
fig

diff --git a/R/ancova.html b/R/ancova.html
index 79bf5d8c..12ac1a0a 100644
--- a/R/ancova.html
+++ b/R/ancova.html
@@ -272,23 +272,23 @@

The Model

model_tidy <- model_ancova %>% tidy()
model_glance %>% gt()
@@ -752,23 +752,23 @@

The Model

model_tidy %>% gt()
@@ -1246,23 +1246,23 @@

The Model

add_row(term = "Total", df = sum(.$df), sumsq = sum(.$sumsq))
model_table %>% gt()
@@ -1745,23 +1745,23 @@

Type 1

get_anova_table() %>% gt()
@@ -2238,23 +2238,23 @@

Type 2

get_anova_table() %>% gt()
@@ -2731,23 +2731,23 @@

Type 3

get_anova_table() %>% gt()
diff --git a/R/cmh.html b/R/cmh.html
index 8079cbbc..c99f4426 100644
--- a/R/cmh.html
+++ b/R/cmh.html
@@ -240,23 +240,23 @@

Available R packages

We did not find any R package that delivers all the same measures as SAS at once. Therefore, we tried out multiple packages:

diff --git a/R/mi_mar_predictive_mean_match.html b/R/mi_mar_predictive_mean_match.html
index c1575822..d8d9d875 100644
--- a/R/mi_mar_predictive_mean_match.html
+++ b/R/mi_mar_predictive_mean_match.html
@@ -368,15 +368,15 @@

Impute with PMM

imp_pmm$imp$bmi
      1    2    3    4    5
-1  30.1 30.1 27.2 28.7 20.4
-3  29.6 33.2 28.7 27.2 26.3
-4  26.3 27.4 22.5 21.7 25.5
-6  20.4 20.4 24.9 25.5 22.5
-10 25.5 24.9 22.7 35.3 22.5
-11 35.3 27.2 33.2 22.0 22.0
-12 25.5 30.1 22.0 22.0 30.1
-16 27.5 26.3 28.7 33.2 26.3
-21 29.6 30.1 27.2 28.7 35.3
+1  22.7 28.7 24.9 22.5 29.6
+3  35.3 30.1 28.7 28.7 28.7
+4  27.4 24.9 25.5 25.5 27.4
+6  20.4 24.9 21.7 24.9 21.7
+10 22.0 20.4 22.5 28.7 27.5
+11 20.4 22.0 27.2 33.2 26.3
+12 27.4 27.5 20.4 24.9 20.4
+16 22.5 27.4 29.6 28.7 24.9
+21 24.9 25.5 21.7 33.2 26.3

An alternative to the standard PMM is midastouch.

@@ -453,15 +453,15 @@

Impute with PMM

imp_pmms$imp$bmi
      1    2    3    4    5
-1  27.4 35.3 26.3 27.5 29.6
-3  29.6 29.6 26.3 26.3 29.6
-4  25.5 27.2 22.7 24.9 27.4
-6  20.4 21.7 21.7 24.9 27.4
-10 28.7 28.7 28.7 26.3 28.7
-11 30.1 29.6 26.3 27.5 35.3
-12 28.7 28.7 22.7 26.3 28.7
-16 29.6 30.1 26.3 26.3 35.3
-21 29.6 30.1 26.3 29.6 20.4
+1  35.3 35.3 20.4 20.4 29.6
+3  30.1 30.1 30.1 29.6 22.0
+4  28.7 25.5 25.5 21.7 21.7
+6  22.5 25.5 21.7 21.7 21.7
+10 28.7 27.5 22.0 21.7 22.5
+11 30.1 35.3 20.4 29.6 29.6
+12 27.4 22.0 28.7 21.7 26.3
+16 33.2 35.3 20.4 29.6 29.6
+21 20.4 33.2 20.4 29.6 29.6
diff --git a/R/survey-stats-summary.html b/R/survey-stats-summary.html
index 09336b34..93044e5b 100644
--- a/R/survey-stats-summary.html
+++ b/R/survey-stats-summary.html
@@ -267,23 +267,23 @@

Simple Survey Designs

head(apisrs) |> gt::gt()
@@ -1127,23 +1127,23 @@

Summary Statistics on Complex Survey Designs

head(nhanes) |> gt::gt()
diff --git a/SAS/survey-stats-summary.html b/SAS/survey-stats-summary.html
index 41007a9c..5c26adb3 100644
--- a/SAS/survey-stats-summary.html
+++ b/SAS/survey-stats-summary.html
@@ -262,23 +262,23 @@

Simple Survey Designs

We will use the API dataset (“API Data Files” 2006), which contains a number of datasets based on different samples from a dataset of academic performance. Initially we will just cover the methodology with a simple random sample and a finite population correction to demonstrate functionality.

@@ -1134,23 +1134,23 @@

Summary Statistics on Complex Survey Designs

Many of the previous examples and notes still apply to more complex survey designs. Here we will demonstrate using a dataset from NHANES (“National Health and Nutrition Examination Survey Data” 2010), which uses both stratification and clustering:

diff --git a/blogs/index.html b/blogs/index.html
index a0fa294f..3edef5f4 100644
--- a/blogs/index.html
+++ b/blogs/index.html
@@ -243,7 +243,7 @@

Blogs


diff --git a/index.html b/index.html
index 6df67a5d..96768546 100644
--- a/index.html
+++ b/index.html
@@ -203,23 +203,23 @@

Repository

The repository below provides examples of statistical methodology in different software and languages, along with a comparison of the results obtained and description of any discrepancies.

@@ -703,7 +703,7 @@

Repository

ANOVA
R SAS
+Python
R vs SAS
diff --git a/minutes/index.html b/minutes/index.html
index e33139ce..f641a7ef 100644
--- a/minutes/index.html
+++ b/minutes/index.html
@@ -247,7 +247,7 @@

Meeting Minutes


 
diff --git a/python/MANOVA.html b/python/MANOVA.html
index 5870d83f..2afad2e5 100644
--- a/python/MANOVA.html
+++ b/python/MANOVA.html
@@ -228,7 +228,7 @@

MANOVA in Python

Example 39.6 Multivariate Analysis of Variance from the SAS MANOVA User Guide

This example employs multivariate analysis of variance (MANOVA) to measure differences in the chemical characteristics of ancient pottery found at four kiln sites in Great Britain. The data are from Tubb, Parker, and Nickless (1980), as reported in Hand et al. (1994).

For each of 26 samples of pottery, the percentages of oxides of five metals are measured. The following statements create the data set and invoke the GLM procedure to perform a one-way MANOVA. Additionally, it is of interest to know whether the pottery from one site in Wales (Llanederyn) differs from the samples from other sites; a CONTRAST statement is used to test this hypothesis.

import pandas as pd
 from statsmodels.multivariate.manova import MANOVA
 
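As a minimal sketch of the MANOVA step that these imports support (assuming the pottery data has been read into a pandas DataFrame named pottery with one column per oxide and a Site factor; the column names here are illustrative, not taken from this page):

# One-way MANOVA of the five oxide percentages on kiln site
maov = MANOVA.from_formula('Al + Fe + Mg + Ca + Na ~ Site', data=pottery)
print(maov.mv_test())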
diff --git a/python/Rounding.html b/python/Rounding.html
index 3891a119..21c8b0ec 100644
--- a/python/Rounding.html
+++ b/python/Rounding.html
@@ -217,7 +217,7 @@ 

Rounding in Python

Python has a built-in round() function that takes two numeric arguments, number and ndigits, and returns the number rounded to the specified number of decimals.

The default number of decimals is 0, meaning that the function will return the nearest integer.

# For integers
 x= 12
 print(round(x))
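Note that Python's round() uses round-half-to-even (banker's rounding), a potential source of cross-language discrepancies; for example:

# Halfway values round to the nearest even integer
print(round(0.5))   # 0
print(round(1.5))   # 2
print(round(2.5))   # 2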
diff --git a/python/Summary_statistics.html b/python/Summary_statistics.html
index 159c24d7..e87c6119 100644
--- a/python/Summary_statistics.html
+++ b/python/Summary_statistics.html
@@ -222,7 +222,7 @@ 

Summary statistics

4. out (optional): An alternate output array in which to place the result.

5. overwrite_input (optional): If True, allows the input array to be modified for the computation.

6. keepdims (optional): If True, the reduced axes are kept in the result as dimensions of size one.

import numpy as np
 
 sample_data=[12, 25, 16, 50, 34, 29, 60, 86, 52, 39, 41]
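A minimal sketch of the call using the optional parameters described above (on this 1-D input, axis=0 is the only axis):

# Median of the sample data; 39.0 for this input
print(np.median(sample_data))

# keepdims=True keeps the reduced axis, returning an array of shape (1,)
print(np.median(sample_data, axis=0, keepdims=True))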
diff --git a/python/ancova.html b/python/ancova.html
index a77321c7..d3829437 100644
--- a/python/ancova.html
+++ b/python/ancova.html
@@ -232,7 +232,7 @@ 

Introduction

Data Summary

import pandas as pd
 
 # Input data
@@ -250,7 +250,7 @@ 

Data Summary

df = pd.DataFrame(data)
# Descriptive statistics
 summary_stats = df.describe()
 
@@ -279,7 +279,7 @@ 

Data Summary

Ancova in Python

In Python, Ancova can be performed using the statsmodels library.

import statsmodels.api as sm
 import statsmodels.formula.api as smf
 from tabulate import tabulate
@@ -313,7 +313,7 @@ 

Ancova in Python

Model:                 OLS              Adj. R-squared:     0.639
Method:                Least Squares    F-statistic:        18.10
Date:                  Mon, 05 Aug 2024 Prob (F-statistic): 1.50e-06
-Time:                 13:32:43         Log-Likelihood:     -82.054
+Time:                 13:33:17         Log-Likelihood:     -82.054
No. Observations:      30               AIC:                172.1
Df Residuals:          26               BIC:                177.7
Df Model:              3
@@ -354,7 +354,7 @@

Ancova in Python

Please note that all values match the corresponding R version, except for the AIC and BIC values, which differ slightly. This should be acceptable for most practical purposes in statistical analysis. Currently, there are ongoing discussions in the statsmodels community regarding the computational details of AIC and BIC.

The following code can be used to enforce complete consistency of AIC and BIC values with R outputs by adding 1 to the number of parameters:

import numpy as np
 
 # Manual calculation of AIC and BIC to ensure consistency with R
@@ -383,7 +383,7 @@ 

Ancova in Python

There are different types of anova computations. The statsmodels.stats.anova.anova_lm function supports types 1, 2 and 3. The code to compute these types is shown below:

import statsmodels.formula.api as smf
 import statsmodels.stats.anova as ssa
 
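A minimal sketch of the three calls, using the Ancova model fitted earlier on this page:

# Fit the Ancova model, then request each sum-of-squares type
model = smf.ols('post ~ C(drug) + pre', data=df).fit()
ancova_table_type_1 = ssa.anova_lm(model, typ=1)
ancova_table_type_2 = ssa.anova_lm(model, typ=2)
ancova_table_type_3 = ssa.anova_lm(model, typ=3)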
@@ -455,7 +455,7 @@ 

Ancova in Python

Type 1 Ancova in Python

print(tabulate(ancova_table_type_1, headers='keys', tablefmt='grid'))
+----+----------+-------+-------+---------+---------+----------+-------------+----------+----------+
@@ -470,7 +470,7 @@ 

Type 1 Ancova in Python

Type 2 Ancova in Python

print(tabulate(ancova_table_type_2, headers='keys', tablefmt='grid'))
+----+----------+-------+-------+----------+---------+----------+-------------+----------+----------+
@@ -485,7 +485,7 @@ 

Type 2 Ancova in Python

Type 3 Ancova in Python

print(tabulate(ancova_table_type_3, headers='keys', tablefmt='grid'))
+----+-----------+-------+-------+----------+---------+----------+-------------+----------+----------+
diff --git a/python/anova.html b/python/anova.html
index 54ca1adc..d628c480 100644
--- a/python/anova.html
+++ b/python/anova.html
@@ -233,7 +233,7 @@ 

Introduction

Anova Test in Python

To perform a one-way ANOVA test in Python we can use the f_oneway() function from the SciPy library. Similarly, to perform a two-way ANOVA test, the anova_lm() function from the statsmodels library is frequently used.

For this test, we’ll create a data frame called df_disease taken from the SAS documentation. The corresponding data can be found here. In this experiment, we are trying to find the impact of different drug and disease groups on stem length.

import pandas as pd
 import statsmodels.api as sm
 from statsmodels.formula.api import ols
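A minimal sketch of the two-way fit, assuming df_disease has columns drug, disease and stemleng (the column names are illustrative):

# Two-way ANOVA with interaction, Type II sums of squares
model = ols('stemleng ~ C(drug) + C(disease) + C(drug):C(disease)', data=df_disease).fit()
print(sm.stats.anova_lm(model, typ=2))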
diff --git a/python/correlation.html b/python/correlation.html
index bdef3b73..676a5ac1 100644
--- a/python/correlation.html
+++ b/python/correlation.html
@@ -228,7 +228,7 @@ 

Correlation Analysis in Python

Pearson’s Correlation

It is a parametric correlation test because it depends on the distribution of the data. It measures the linear dependence between two variables x and y. It is the ratio between the covariance of the two variables and the product of their standard deviations. The result always has a value between -1 and 1.

import pandas as pd
 from scipy.stats import pearsonr
 
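A minimal sketch of the call on illustrative data (kendalltau and spearmanr in the sections below share the same call pattern):

# Pearson correlation coefficient and two-sided p-value
x = [15, 18, 21, 24, 27]
y = [10, 25, 17, 23, 25]
corr, p_value = pearsonr(x, y)
print(corr, p_value)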
@@ -250,7 +250,7 @@ 

Pearson’s Correlation

Kendall’s Rank

A τ test is a non-parametric hypothesis test for statistical dependence based on the τ coefficient. It is a measure of rank correlation: the similarity of the orderings of the data when ranked by each of the quantities. The Kendall correlation between two variables will be high when observations have a similar (or identical for a correlation of 1) rank (i.e. relative position label of the observations within the variable: 1st, 2nd, 3rd, etc.) between the two variables, and low when observations have a dissimilar (or fully different for a correlation of −1) rank between the two variables.

import pandas as pd
 from scipy.stats import kendalltau 
 
@@ -273,7 +273,7 @@ 

Kendall’s Rank

Spearman’s Rank

Spearman’s Rank Correlation is a statistical measure of the strength and direction of the monotonic relationship between two continuous variables. The observations are first ranked, and the correlation is computed on those ranks. It is denoted by the symbol “rho” (ρ) and can take values between -1 and +1. A positive value of rho indicates a positive relationship between the two variables, while a negative value of rho indicates a negative relationship. A rho value of 0 indicates no association between the two variables.

import pandas as pd
 from scipy.stats import spearmanr
  
diff --git a/python/kruskal_wallis.html b/python/kruskal_wallis.html
index 7e2f8bcf..5534a882 100644
--- a/python/kruskal_wallis.html
+++ b/python/kruskal_wallis.html
@@ -226,7 +226,7 @@ 

Kruskal Wallis in Python

Introduction

The Kruskal-Wallis test is a non-parametric equivalent to the one-way ANOVA. For this example, the data used is a subset of the iris dataset, testing for a difference in sepal width between species of flower.

import pandas as pd
 
 # Define the data
@@ -266,7 +266,7 @@ 

Introduction

Implementing Kruskal-Wallis in Python

The Kruskal-Wallis test can be implemented in Python using the kruskal function from scipy.stats. The null hypothesis is that the samples are from identical populations.

from scipy.stats import kruskal
 
 # Separate the data for each species
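A minimal sketch of the separation and test call that follow (assuming the subset defined above is a DataFrame df with Species and SepalWidth columns; the names are illustrative):

setosa = df[df['Species'] == 'setosa']['SepalWidth']
versicolor = df[df['Species'] == 'versicolor']['SepalWidth']
virginica = df[df['Species'] == 'virginica']['SepalWidth']

# Kruskal-Wallis H-statistic and p-value
h_stat, p_value = kruskal(setosa, versicolor, virginica)
print(h_stat, p_value)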
diff --git a/python/linear_regression.html b/python/linear_regression.html
index 10330f95..88d0a0ca 100644
--- a/python/linear_regression.html
+++ b/python/linear_regression.html
@@ -229,7 +229,7 @@ 

Linear Regression

Descriptive Statistics

The first step is to obtain the simple descriptive statistics for the numeric variables of the htwt data, and one-way frequencies for the categorical variables. This is accomplished by employing the summary function. There are 237 participants who are from 13.9 to 25 years old. It is a cross-sectional study, with each participant having one observation. We can use this data set to examine the relationship of participants’ height to their age and sex.

import pandas as pd
 import statsmodels.api as sm
 
@@ -237,7 +237,7 @@ 

Descriptive Statistics

htwt = pd.read_csv("../data/htwt.csv")

In order to create a regression model to demonstrate the relationship between age and height for females, we first need to create a flag variable identifying females and an interaction variable between age and the female gender flag.

htwt['female'] = (htwt['SEX'] == 'f').astype(int)
 htwt['fem_age'] = htwt['AGE'] * htwt['female']
 htwt.head()
@@ -319,7 +319,7 @@

Descriptive Statistics

Regression Analysis

Next, we fit a regression model, representing the relationships between gender, age, height and the interaction variable created in the datastep above. We again use a where statement to restrict the analysis to those who are less than or equal to 19 years old. We use the clb option to get a 95% confidence interval for each of the parameters in the model. The model that we are fitting is height = b0 + b1 x female + b2 x age + b3 x fem_age + e

X = htwt[['female', 'AGE', 'fem_age']][htwt['AGE'] <= 19]
 X = sm.add_constant(X)
 Y = htwt['HEIGHT'][htwt['AGE'] <= 19]
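A minimal sketch of the fit that follows this setup:

# Fit the OLS model and display the summary table
model = sm.OLS(Y, X).fit()
model.summary()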
@@ -357,7 +357,7 @@ 

Regression Analysis

Time:
-13:32:46
+13:33:20
Log-Likelihood: -534.17
diff --git a/python/logistic_regression.html b/python/logistic_regression.html
index 74b7c024..d3e16492 100644
--- a/python/logistic_regression.html
+++ b/python/logistic_regression.html
@@ -267,7 +267,7 @@

Logistic Regression

Imports

#data manipulation
 import pandas as pd
 import numpy as np
@@ -289,7 +289,7 @@ 

Background

Example : Lung cancer data

Data source: Loprinzi CL. Laurie JA. Wieand HS. Krook JE. Novotny PJ. Kugler JW. Bartel J. Law M. Bateman M. Klatt NE. et al. Prospective evaluation of prognostic variables from patient-completed questionnaires. North Central Cancer Treatment Group. Journal of Clinical Oncology. 12(3):601-7, 1994.

These data were sourced from the R package {survival} and have been downloaded and stored in the data folder.

# importing and prepare
 lung2 = pd.read_csv("../data/lung_cancer.csv")
 
@@ -302,7 +302,7 @@ 

Example : Lung cancer data

Logistic Regression Modelling

Let’s further prepare our data for modelling by selecting the explanatory variables and the dependent variable. The Python packages that we are aware of require complete (i.e. no missing values) data, so for convenience of demonstrating these methods we will drop rows with missing values.

x_vars = ["age", "sex", "ph.ecog", "meal.cal"]
 y_var = "wt_grp"
 
@@ -316,7 +316,7 @@ 

Logistic Regression Modelling

Statsmodels package

We will use the sm.Logit() method to fit our logistic regression model.

#intercept column
 x_sm = sm.add_constant(x)
 
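The fit call that completes this step:

#fit model
lr_sm = sm.Logit(y, x_sm).fit()
print(lr_sm.summary())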
@@ -333,7 +333,7 @@ 

Statsmodels package

Model fitting

In addition to the information contained in the summary, we can display the model coefficients as odds ratios:

print("Odds ratios for statsmodels logistic regression:")
 print(np.exp(lr_sm.params))
@@ -364,7 +364,7 @@

Model fitting

We can also provide the 95% confidence intervals (alpha = 0.05) for the odds ratios:

print("CI at 5% for statsmodels logistic regression:")
 print(np.exp(lr_sm.conf_int(alpha = 0.05)))
@@ -381,7 +381,7 @@

Model fitting

Prediction

Let’s use our trained model to make a weight loss prediction about a new patient.

# new female, symptomatic but completely ambulatory patient consuming 2500 calories
 new_pt = pd.DataFrame({
     "age": [56],
@@ -419,11 +419,11 @@ 

Scikit-learn Package

It’s important to note that l2 regularisation is applied by default in the scikit-learn implementation of logistic regression. More recent releases of this package include an option to have no regularisation penalty.

lr_sk = LogisticRegression(penalty=None).fit(x, y)

Unlike the statsmodels approach, scikit-learn doesn’t have a summary method for the model, but you can extract some of the model parameters as follows:

print("Intercept for scikit learn logistic regression:")
 print(lr_sk.intercept_)
 print("Odds ratios for scikit learn logistic regression:")
@@ -439,7 +439,7 @@ 

Scikit-learn Package

Prediction

Using the same new patient example we can use our logistic regression model to make a prediction. The predict_proba method is used to return the probability for each class. If you are interested in viewing the prediction for y = 1, i.e. the probability of weight loss, then you can select the second probability as shown:

print("Probability of weight loss using the scikit-learn package:")
 print(lr_sk.predict_proba(new_pt)[:,1])
diff --git a/python/one_sample_t_test.html b/python/one_sample_t_test.html index cd7e54c7..a6e15365 100644 --- a/python/one_sample_t_test.html +++ b/python/one_sample_t_test.html @@ -233,7 +233,7 @@

One Sample t-test

Data Used

import pandas as pd
 
 # Create sample data
@@ -248,7 +248,7 @@ 

Data Used



The following code was used to test the comparison in Python. Note that the baseline null hypothesis goes in the “popmean” parameter.

import pandas as pd
 from scipy import stats
 
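A minimal sketch of the test call (the data column and null value are illustrative):

# One-sample t-test; the baseline null-hypothesis mean goes in popmean
t_stat, p_value = stats.ttest_1samp(df['score'], popmean=100)
print(t_stat, p_value)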
diff --git a/python/paired_t_test.html b/python/paired_t_test.html
index 56f5b09d..2c89ad04 100644
--- a/python/paired_t_test.html
+++ b/python/paired_t_test.html
@@ -235,7 +235,7 @@ 

Paired t-test in Python

Data Used

import pandas as pd
 
 # Create sample data
@@ -250,7 +250,7 @@ 

Data Used

Paired t-test

The following code was used to test the comparison in Python. Note that the baseline null hypothesis goes in the “popmean” parameter.

import pandas as pd
 from scipy import stats
 
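A minimal sketch consistent with the note above (the column names are illustrative): the paired test can be run as a one-sample t-test on the within-pair differences, which is equivalent to stats.ttest_rel(df['post'], df['pre']):

# Paired t-test via the within-pair differences
diffs = df['post'] - df['pre']
t_stat, p_value = stats.ttest_1samp(diffs, popmean=0)
print(t_stat, p_value)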
diff --git a/python/skewness_kurtosis.html b/python/skewness_kurtosis.html
index b35061b6..1ced0cf2 100644
--- a/python/skewness_kurtosis.html
+++ b/python/skewness_kurtosis.html
@@ -258,7 +258,7 @@ 

Skewness and Kurtosis in Python

Skewness measures the amount of asymmetry in a distribution, while kurtosis describes the “tailedness” of the curve. These measures are frequently used to assess the normality of the data. There are several methods to calculate these measures. In Python, the pandas package and the scipy.stats functions skew and kurtosis can be used.

Data Used

import pandas as pd
 from scipy.stats import skew, kurtosis
 
@@ -288,7 +288,7 @@ 

Skewness

\]

All three skewness measures are unbiased under normality. The three methods are illustrated in the following code:

# Skewness
 type1_skew = skew(df['points'])
 type2_skew = df['points'].skew()
@@ -319,7 +319,7 @@ 

Kurtosis

\[b_2 = m_4/s^4-3 = (g_2 + 3)(1-1/n)^2-3\]

Only \(G_2\) (corresponding to type 2) is unbiased under normality. The three methods are illustrated in the following code:

# Kurtosis
 type1_kurt = kurtosis(df['points'])
 
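A sketch of the remaining two measures, following the formulas above (type1_kurt is g2 from scipy; pandas’ kurt() gives the unbiased G2):

# Type 2 (G2): unbiased under normality
type2_kurt = df['points'].kurt()

# Type 3 (b2), derived from g2 via the identity above
n = len(df['points'])
type3_kurt = (type1_kurt + 3) * (1 - 1/n)**2 - 3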
diff --git a/python/two_samples_t_test.html b/python/two_samples_t_test.html
index aa15b8a1..ea768413 100644
--- a/python/two_samples_t_test.html
+++ b/python/two_samples_t_test.html
@@ -235,7 +235,7 @@ 

Two Sample t-test in Python

Data Used

The following data was used in this example.

import pandas as pd
 import numpy as np
 from scipy import stats
@@ -257,7 +257,7 @@ 

Student’s T-Test

Code

The following code was used to test the comparison in Python. Note that we must separate the single variable into two variables to satisfy the scipy stats package syntax.

# Separate data into two groups
 group1 = df[df['trt_grp'] == 'placebo']['WtGain']
 group2 = df[df['trt_grp'] == 'treatment']['WtGain']
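The test call that completes this step (Student's version assumes equal variances; compare the Welch call below):

# Student's t-test assuming equal variances
t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=True)
print(t_stat, p_value)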
@@ -281,7 +281,7 @@ 

Welch’s T-Test

Code

The following code was used to test the comparison in Python using Welch’s t-test.

# Perform Welch's t-test assuming unequal variances
 t_stat_welch, p_value_unequal_var = stats.ttest_ind(group1, group2, equal_var=False)
 
diff --git a/search.json b/search.json
index 77c5285c..968fda02 100644
--- a/search.json
+++ b/search.json
@@ -508,7 +508,7 @@
     "href": "index.html",
     "title": "CAMIS - A PHUSE DVOST Working Group",
     "section": "",
-    "text": "Introduction\nSeveral discrepancies have been discovered in statistical analysis results between different programming languages, even in fully qualified statistical computing environments. Subtle differences exist between the fundamental approaches implemented by each language, yielding differences in results which are each correct in their own right. The fact that these differences exist causes unease on the behalf of sponsor companies when submitting to a regulatory agency, as it is uncertain if the agency will view these differences as problematic. In its Statistical Software Clarifying Statement, the US Food and Drug Administration (FDA) states that it “FDA does not require use of any specific software for statistical analyses” and that “the computer software used for data management and statistical analysis should be reliable.” Observing differences across languages can reduce the analyst’s confidence in reliability and, by understanding the source of any discrepancies, one can reinstate confidence in reliability.\n\nMotivation\nThe goal of this project is to demystify conflicting results between software and to help ease the transitions to new languages by providing comparison and comprehensive explanations.\n\n\nRepository\nThe repository below provides examples of statistical methodology in different software and languages, along with a comparison of the results obtained and description of any discrepancies.\n\n\n\n\n\n\n\n\nMethods\nR\nSAS\nPython\nComparison\n\n\n\n\nSummary Statistics\nRounding\nR\nSAS\nPython\nR vs SAS\n\n\nSummary statistics\nR\nSAS\nPython\nR vs SAS\n\n\nSkewness/Kurtosis\nR\nSAS\nPython\nR vs SAS\n\n\nGeneral Linear Models\nOne Sample t-test\nR\nSAS\nPython\nR vs SAS\n\n\nPaired t-test\nR\nSAS\nPython\nR vs SAS\n\n\nTwo Sample t-test\nR\nSAS\nPython\nR vs SAS\n\n\nANOVA\nR\nSAS\n\nR vs SAS\n\n\nANCOVA\nR\nSAS\nPython\nR vs SAS\n\n\nMANOVA\nR\nSAS\nPython\nR vs SAS\n\n\nLinear Regression\nR\nSAS\nPython\nR vs SAS\n\n\nGeneralized Linear Models\nLogistic Regression\nR\nSAS\nPython\nR vs SAS\n\n\nPoisson/Negative Binomial Regression\nR\n\n\nR vs SAS\n\n\nCategorical Repeated Measures\n\n\n\n\n\n\nCategorical Multiple Imputation\n\n\n\n\n\n\nNon-parametric Analysis\nWilcoxon signed rank\nR\n\n\n\n\n\nMann-Whitney U/Wilcoxon rank sum\nR\n\n\n\n\n\nKolmogorov-Smirnov test\n\n\n\n\n\n\nKruskall-Wallis test\nR\nSAS\nPython\nR vs SAS\n\n\nFriedman test\n\n\n\n\n\n\nJonckheere test\n\n\n\n\n\n\nHodges-Lehman Estimator\nR\nSAS\n\n\n\n\nCategorical Data Analysis\nBinomial test\nR\n\n\n\n\n\nMcNemar's test\nR\nSAS\n\nR vs SAS\n\n\nChi-Square Association/Fishers exact\nR\nSAS\n\nR vs SAS\n\n\nCochran Mantel Haenszel\nR\nSAS\n\nR vs SAS\n\n\nConfidence Intervals for proportions\nR\n\n\n\n\n\nRepeated Measures\nLinear Mixed Model (MMRM)\nR\nSAS\n\nR vs SAS\n\n\nGeneralized Linear Mixed Model (MMRM)\n\n\n\n\n\n\nBayesian MMRM\n\n\n\n\n\n\nMultiple Imputation - Continuous Data MAR\nMCMC\n\n\n\n\n\n\nLinear regression\nR\n\n\n\n\n\nPredictive Mean Matching\nR\n\n\n\n\n\nPropensity Scores\n\n\n\n\n\n\nMultiple Imputation - Continuous Data MNAR\nDelta Adjustment/Tipping Point\n\n\n\n\n\n\nReference-Based Imputation/Sequential Methods\n\n\n\n\n\n\nReference-Based Imputation/Joint Modelling\n\n\n\n\n\n\nCorrelation\nPearson's/ Spearman's/ Kendall's Rank\nR\nSAS\nPython\nR vs SAS\n\n\nSurvival Models\nKaplan-Meier Log-rank test and Cox-PH\nR\nSAS\n\nR vs SAS\n\n\nAccelerated Failure Time\nR\n\n\n\n\n\nNon-proportional hazards methods\n\n\n\n\n\n\nSample size /Power 
calculations\nSingle timepoint analysis\n\n\n\n\n\n\nGroup-sequential designs\n\n\n\n\n\n\nMultivariate methods\nClustering\n\n\n\n\n\n\nFactor analysis\n\n\n\n\n\n\nPCA\nR\n\n\n\n\n\nCanonical correlation\n\n\n\n\n\n\nPLS\n\n\n\n\n\n\nOther Methods\nSurvey statistics\nR\nSAS\n\nR vs SAS\n\n\nNearest neighbour\n\n\n\n\n\n\nCausal inference\n\n\n\n\n\n\nMachine learning"
+    "text": "Introduction\nSeveral discrepancies have been discovered in statistical analysis results between different programming languages, even in fully qualified statistical computing environments. Subtle differences exist between the fundamental approaches implemented by each language, yielding differences in results which are each correct in their own right. The fact that these differences exist causes unease on the behalf of sponsor companies when submitting to a regulatory agency, as it is uncertain if the agency will view these differences as problematic. In its Statistical Software Clarifying Statement, the US Food and Drug Administration (FDA) states that it “FDA does not require use of any specific software for statistical analyses” and that “the computer software used for data management and statistical analysis should be reliable.” Observing differences across languages can reduce the analyst’s confidence in reliability and, by understanding the source of any discrepancies, one can reinstate confidence in reliability.\n\nMotivation\nThe goal of this project is to demystify conflicting results between software and to help ease the transitions to new languages by providing comparison and comprehensive explanations.\n\n\nRepository\nThe repository below provides examples of statistical methodology in different software and languages, along with a comparison of the results obtained and description of any discrepancies.\n\n\n\n\n\n\n\n\nMethods\nR\nSAS\nPython\nComparison\n\n\n\n\nSummary Statistics\nRounding\nR\nSAS\nPython\nR vs SAS\n\n\nSummary statistics\nR\nSAS\nPython\nR vs SAS\n\n\nSkewness/Kurtosis\nR\nSAS\nPython\nR vs SAS\n\n\nGeneral Linear Models\nOne Sample t-test\nR\nSAS\nPython\nR vs SAS\n\n\nPaired t-test\nR\nSAS\nPython\nR vs SAS\n\n\nTwo Sample t-test\nR\nSAS\nPython\nR vs SAS\n\n\nANOVA\nR\nSAS\nPython\nR vs SAS\n\n\nANCOVA\nR\nSAS\nPython\nR vs SAS\n\n\nMANOVA\nR\nSAS\nPython\nR vs SAS\n\n\nLinear Regression\nR\nSAS\nPython\nR vs SAS\n\n\nGeneralized Linear Models\nLogistic Regression\nR\nSAS\nPython\nR vs SAS\n\n\nPoisson/Negative Binomial Regression\nR\n\n\nR vs SAS\n\n\nCategorical Repeated Measures\n\n\n\n\n\n\nCategorical Multiple Imputation\n\n\n\n\n\n\nNon-parametric Analysis\nWilcoxon signed rank\nR\n\n\n\n\n\nMann-Whitney U/Wilcoxon rank sum\nR\n\n\n\n\n\nKolmogorov-Smirnov test\n\n\n\n\n\n\nKruskall-Wallis test\nR\nSAS\nPython\nR vs SAS\n\n\nFriedman test\n\n\n\n\n\n\nJonckheere test\n\n\n\n\n\n\nHodges-Lehman Estimator\nR\nSAS\n\n\n\n\nCategorical Data Analysis\nBinomial test\nR\n\n\n\n\n\nMcNemar's test\nR\nSAS\n\nR vs SAS\n\n\nChi-Square Association/Fishers exact\nR\nSAS\n\nR vs SAS\n\n\nCochran Mantel Haenszel\nR\nSAS\n\nR vs SAS\n\n\nConfidence Intervals for proportions\nR\n\n\n\n\n\nRepeated Measures\nLinear Mixed Model (MMRM)\nR\nSAS\n\nR vs SAS\n\n\nGeneralized Linear Mixed Model (MMRM)\n\n\n\n\n\n\nBayesian MMRM\n\n\n\n\n\n\nMultiple Imputation - Continuous Data MAR\nMCMC\n\n\n\n\n\n\nLinear regression\nR\n\n\n\n\n\nPredictive Mean Matching\nR\n\n\n\n\n\nPropensity Scores\n\n\n\n\n\n\nMultiple Imputation - Continuous Data MNAR\nDelta Adjustment/Tipping Point\n\n\n\n\n\n\nReference-Based Imputation/Sequential Methods\n\n\n\n\n\n\nReference-Based Imputation/Joint Modelling\n\n\n\n\n\n\nCorrelation\nPearson's/ Spearman's/ Kendall's Rank\nR\nSAS\nPython\nR vs SAS\n\n\nSurvival Models\nKaplan-Meier Log-rank test and Cox-PH\nR\nSAS\n\nR vs SAS\n\n\nAccelerated Failure Time\nR\n\n\n\n\n\nNon-proportional hazards methods\n\n\n\n\n\n\nSample size 
/Power calculations\nSingle timepoint analysis\n\n\n\n\n\n\nGroup-sequential designs\n\n\n\n\n\n\nMultivariate methods\nClustering\n\n\n\n\n\n\nFactor analysis\n\n\n\n\n\n\nPCA\nR\n\n\n\n\n\nCanonical correlation\n\n\n\n\n\n\nPLS\n\n\n\n\n\n\nOther Methods\nSurvey statistics\nR\nSAS\n\nR vs SAS\n\n\nNearest neighbour\n\n\n\n\n\n\nCausal inference\n\n\n\n\n\n\nMachine learning"
   },
   {
     "objectID": "contribution/contribution.html",
@@ -557,7 +557,7 @@
     "href": "python/linear_regression.html",
     "title": "Linear Regression",
     "section": "",
-    "text": "To demonstrate the use of linear regression we examine a dataset that illustrates the relationship between Height and Weight in a group of 237 teen-aged boys and girls. The dataset is available here and is imported to the workspace.\n\nDescriptive Statistics\nThe first step is to obtain the simple descriptive statistics for the numeric variables of htwt data, and one-way frequencies for categorical variables. This is accomplished by employing summary function. There are 237 participants who are from 13.9 to 25 years old. It is a cross-sectional study, with each participant having one observation. We can use this data set to examine the relationship of participants’ height to their age and sex.\n\nimport pandas as pd\nimport statsmodels.api as sm\n\n# Importing CSV\nhtwt = pd.read_csv(\"../data/htwt.csv\")\n\nIn order to create a regression model to demonstrate the relationship between age and height for females, we first need to create a flag variable identifying females and an interaction variable between age and female gender flag.\n\nhtwt['female'] = (htwt['SEX'] == 'f').astype(int)\nhtwt['fem_age'] = htwt['AGE'] * htwt['female']\nhtwt.head()\n\n\n\n\n\n\n\n\nROW\nSEX\nAGE\nHEIGHT\nWEIGHT\nfemale\nfem_age\n\n\n\n\n0\n1\nf\n14.3\n56.3\n85.0\n1\n14.3\n\n\n1\n2\nf\n15.5\n62.3\n105.0\n1\n15.5\n\n\n2\n3\nf\n15.3\n63.3\n108.0\n1\n15.3\n\n\n3\n4\nf\n16.1\n59.0\n92.0\n1\n16.1\n\n\n4\n5\nf\n19.1\n62.5\n112.5\n1\n19.1\n\n\n\n\n\n\n\n\n\nRegression Analysis\nNext, we fit a regression model, representing the relationships between gender, age, height and the interaction variable created in the datastep above. We again use a where statement to restrict the analysis to those who are less than or equal to 19 years old. We use the clb option to get a 95% confidence interval for each of the parameters in the model. The model that we are fitting is height = b0 + b1 x female + b2 x age + b3 x fem_age + e\n\nX = htwt[['female', 'AGE', 'fem_age']][htwt['AGE'] <= 19]\nX = sm.add_constant(X)\nY = htwt['HEIGHT'][htwt['AGE'] <= 19]\n\nmodel = sm.OLS(Y, X).fit()\n\nmodel.summary()\n\n\nOLS Regression Results\n\n\nDep. Variable:\nHEIGHT\nR-squared:\n0.460\n\n\nModel:\nOLS\nAdj. R-squared:\n0.452\n\n\nMethod:\nLeast Squares\nF-statistic:\n60.93\n\n\nDate:\nMon, 05 Aug 2024\nProb (F-statistic):\n1.50e-28\n\n\nTime:\n13:32:46\nLog-Likelihood:\n-534.17\n\n\nNo. Observations:\n219\nAIC:\n1076.\n\n\nDf Residuals:\n215\nBIC:\n1090.\n\n\nDf Model:\n3\n\n\n\n\nCovariance Type:\nnonrobust\n\n\n\n\n\n\n\n\n\n\n\ncoef\nstd err\nt\nP>|t|\n[0.025\n0.975]\n\n\nconst\n28.8828\n2.873\n10.052\n0.000\n23.219\n34.547\n\n\nfemale\n13.6123\n4.019\n3.387\n0.001\n5.690\n21.534\n\n\nAGE\n2.0313\n0.178\n11.435\n0.000\n1.681\n2.381\n\n\nfem_age\n-0.9294\n0.248\n-3.750\n0.000\n-1.418\n-0.441\n\n\n\n\n\n\n\n\nOmnibus:\n1.300\nDurbin-Watson:\n2.284\n\n\nProb(Omnibus):\n0.522\nJarque-Bera (JB):\n0.981\n\n\nSkew:\n-0.133\nProb(JB):\n0.612\n\n\nKurtosis:\n3.191\nCond. No.\n450.\n\n\n\nNotes:[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n\n\nFrom the coefficients table b0,b1,b2,b3 are estimated as b0=28.88 b1=13.61 b2=2.03 b3=-0.92942\nThe resulting regression model for height, age and gender based on the available data is height=28.8828 + 13.6123 x female + 2.0313 x age -0.9294 x fem_age"
+    "text": "To demonstrate the use of linear regression we examine a dataset that illustrates the relationship between Height and Weight in a group of 237 teen-aged boys and girls. The dataset is available here and is imported to the workspace.\n\nDescriptive Statistics\nThe first step is to obtain the simple descriptive statistics for the numeric variables of htwt data, and one-way frequencies for categorical variables. This is accomplished by employing summary function. There are 237 participants who are from 13.9 to 25 years old. It is a cross-sectional study, with each participant having one observation. We can use this data set to examine the relationship of participants’ height to their age and sex.\n\nimport pandas as pd\nimport statsmodels.api as sm\n\n# Importing CSV\nhtwt = pd.read_csv(\"../data/htwt.csv\")\n\nIn order to create a regression model to demonstrate the relationship between age and height for females, we first need to create a flag variable identifying females and an interaction variable between age and female gender flag.\n\nhtwt['female'] = (htwt['SEX'] == 'f').astype(int)\nhtwt['fem_age'] = htwt['AGE'] * htwt['female']\nhtwt.head()\n\n\n\n\n\n\n\n\nROW\nSEX\nAGE\nHEIGHT\nWEIGHT\nfemale\nfem_age\n\n\n\n\n0\n1\nf\n14.3\n56.3\n85.0\n1\n14.3\n\n\n1\n2\nf\n15.5\n62.3\n105.0\n1\n15.5\n\n\n2\n3\nf\n15.3\n63.3\n108.0\n1\n15.3\n\n\n3\n4\nf\n16.1\n59.0\n92.0\n1\n16.1\n\n\n4\n5\nf\n19.1\n62.5\n112.5\n1\n19.1\n\n\n\n\n\n\n\n\n\nRegression Analysis\nNext, we fit a regression model, representing the relationships between gender, age, height and the interaction variable created in the datastep above. We again use a where statement to restrict the analysis to those who are less than or equal to 19 years old. We use the clb option to get a 95% confidence interval for each of the parameters in the model. The model that we are fitting is height = b0 + b1 x female + b2 x age + b3 x fem_age + e\n\nX = htwt[['female', 'AGE', 'fem_age']][htwt['AGE'] <= 19]\nX = sm.add_constant(X)\nY = htwt['HEIGHT'][htwt['AGE'] <= 19]\n\nmodel = sm.OLS(Y, X).fit()\n\nmodel.summary()\n\n\nOLS Regression Results\n\n\nDep. Variable:\nHEIGHT\nR-squared:\n0.460\n\n\nModel:\nOLS\nAdj. R-squared:\n0.452\n\n\nMethod:\nLeast Squares\nF-statistic:\n60.93\n\n\nDate:\nMon, 05 Aug 2024\nProb (F-statistic):\n1.50e-28\n\n\nTime:\n13:33:20\nLog-Likelihood:\n-534.17\n\n\nNo. Observations:\n219\nAIC:\n1076.\n\n\nDf Residuals:\n215\nBIC:\n1090.\n\n\nDf Model:\n3\n\n\n\n\nCovariance Type:\nnonrobust\n\n\n\n\n\n\n\n\n\n\n\ncoef\nstd err\nt\nP>|t|\n[0.025\n0.975]\n\n\nconst\n28.8828\n2.873\n10.052\n0.000\n23.219\n34.547\n\n\nfemale\n13.6123\n4.019\n3.387\n0.001\n5.690\n21.534\n\n\nAGE\n2.0313\n0.178\n11.435\n0.000\n1.681\n2.381\n\n\nfem_age\n-0.9294\n0.248\n-3.750\n0.000\n-1.418\n-0.441\n\n\n\n\n\n\n\n\nOmnibus:\n1.300\nDurbin-Watson:\n2.284\n\n\nProb(Omnibus):\n0.522\nJarque-Bera (JB):\n0.981\n\n\nSkew:\n-0.133\nProb(JB):\n0.612\n\n\nKurtosis:\n3.191\nCond. No.\n450.\n\n\n\nNotes:[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n\n\nFrom the coefficients table b0,b1,b2,b3 are estimated as b0=28.88 b1=13.61 b2=2.03 b3=-0.92942\nThe resulting regression model for height, age and gender based on the available data is height=28.8828 + 13.6123 x female + 2.0313 x age -0.9294 x fem_age"
   },
   {
     "objectID": "python/kruskal_wallis.html",
@@ -1124,7 +1124,7 @@
     "href": "R/mi_mar_predictive_mean_match.html#example",
     "title": "Multiple Imputation: Predictive Mean Matching",
     "section": "Example",
-    "text": "Example\nWe use the small dataset nhanes included in mice package. It has 25 rows, and three out of four variables have missings.\nThe original NHANES data is a large national level survey, some are publicly available via R package nhanes.\n\nlibrary(mice)\n\nWarning in check_dep_version(): ABI version mismatch: \nlme4 was built with Matrix ABI version 1\nCurrent Matrix ABI version is 0\nPlease re-install lme4 from source or restore original 'Matrix' package\n\n\n\nAttaching package: 'mice'\n\n\nThe following object is masked from 'package:stats':\n\n    filter\n\n\nThe following objects are masked from 'package:base':\n\n    cbind, rbind\n\n# load example dataset from mice\nhead(nhanes)\n\n  age  bmi hyp chl\n1   1   NA  NA  NA\n2   2 22.7   1 187\n3   1   NA   1 187\n4   3   NA  NA  NA\n5   1 20.4   1 113\n6   3   NA  NA 184\n\nsummary(nhanes)\n\n      age            bmi             hyp             chl       \n Min.   :1.00   Min.   :20.40   Min.   :1.000   Min.   :113.0  \n 1st Qu.:1.00   1st Qu.:22.65   1st Qu.:1.000   1st Qu.:185.0  \n Median :2.00   Median :26.75   Median :1.000   Median :187.0  \n Mean   :1.76   Mean   :26.56   Mean   :1.235   Mean   :191.4  \n 3rd Qu.:2.00   3rd Qu.:28.93   3rd Qu.:1.000   3rd Qu.:212.0  \n Max.   :3.00   Max.   :35.30   Max.   :2.000   Max.   :284.0  \n                NA's   :9       NA's   :8       NA's   :10     \n\n\n\nImpute with PMM\nTo impute with PMM is straightforward: specify the method, method = pmm.\n\nimp_pmm <- mice(nhanes, method = 'pmm', m=5, maxit=10)\n\n\n iter imp variable\n  1   1  bmi  hyp  chl\n  1   2  bmi  hyp  chl\n  1   3  bmi  hyp  chl\n  1   4  bmi  hyp  chl\n  1   5  bmi  hyp  chl\n  2   1  bmi  hyp  chl\n  2   2  bmi  hyp  chl\n  2   3  bmi  hyp  chl\n  2   4  bmi  hyp  chl\n  2   5  bmi  hyp  chl\n  3   1  bmi  hyp  chl\n  3   2  bmi  hyp  chl\n  3   3  bmi  hyp  chl\n  3   4  bmi  hyp  chl\n  3   5  bmi  hyp  chl\n  4   1  bmi  hyp  chl\n  4   2  bmi  hyp  chl\n  4   3  bmi  hyp  chl\n  4   4  bmi  hyp  chl\n  4   5  bmi  hyp  chl\n  5   1  bmi  hyp  chl\n  5   2  bmi  hyp  chl\n  5   3  bmi  hyp  chl\n  5   4  bmi  hyp  chl\n  5   5  bmi  hyp  chl\n  6   1  bmi  hyp  chl\n  6   2  bmi  hyp  chl\n  6   3  bmi  hyp  chl\n  6   4  bmi  hyp  chl\n  6   5  bmi  hyp  chl\n  7   1  bmi  hyp  chl\n  7   2  bmi  hyp  chl\n  7   3  bmi  hyp  chl\n  7   4  bmi  hyp  chl\n  7   5  bmi  hyp  chl\n  8   1  bmi  hyp  chl\n  8   2  bmi  hyp  chl\n  8   3  bmi  hyp  chl\n  8   4  bmi  hyp  chl\n  8   5  bmi  hyp  chl\n  9   1  bmi  hyp  chl\n  9   2  bmi  hyp  chl\n  9   3  bmi  hyp  chl\n  9   4  bmi  hyp  chl\n  9   5  bmi  hyp  chl\n  10   1  bmi  hyp  chl\n  10   2  bmi  hyp  chl\n  10   3  bmi  hyp  chl\n  10   4  bmi  hyp  chl\n  10   5  bmi  hyp  chl\n\nimp_pmm\n\nClass: mids\nNumber of multiple imputations:  5 \nImputation methods:\n  age   bmi   hyp   chl \n   \"\" \"pmm\" \"pmm\" \"pmm\" \nPredictorMatrix:\n    age bmi hyp chl\nage   0   1   1   1\nbmi   1   0   1   1\nhyp   1   1   0   1\nchl   1   1   1   0\n\n# imputations for bmi\nimp_pmm$imp$bmi\n\n      1    2    3    4    5\n1  30.1 30.1 27.2 28.7 20.4\n3  29.6 33.2 28.7 27.2 26.3\n4  26.3 27.4 22.5 21.7 25.5\n6  20.4 20.4 24.9 25.5 22.5\n10 25.5 24.9 22.7 35.3 22.5\n11 35.3 27.2 33.2 22.0 22.0\n12 25.5 30.1 22.0 22.0 30.1\n16 27.5 26.3 28.7 33.2 26.3\n21 29.6 30.1 27.2 28.7 35.3\n\n\nAn alternative to the standard PMM is midastouch.\n\nimp_pmms <- mice(nhanes, method = 'midastouch', m=5, maxit=10)\n\n\n iter imp variable\n  1   1  bmi  hyp  chl\n  1   
2  bmi  hyp  chl\n  1   3  bmi  hyp  chl\n  1   4  bmi  hyp  chl\n  1   5  bmi  hyp  chl\n  2   1  bmi  hyp  chl\n  2   2  bmi  hyp  chl\n  2   3  bmi  hyp  chl\n  2   4  bmi  hyp  chl\n  2   5  bmi  hyp  chl\n  3   1  bmi  hyp  chl\n  3   2  bmi  hyp  chl\n  3   3  bmi  hyp  chl\n  3   4  bmi  hyp  chl\n  3   5  bmi  hyp  chl\n  4   1  bmi  hyp  chl\n  4   2  bmi  hyp  chl\n  4   3  bmi  hyp  chl\n  4   4  bmi  hyp  chl\n  4   5  bmi  hyp  chl\n  5   1  bmi  hyp  chl\n  5   2  bmi  hyp  chl\n  5   3  bmi  hyp  chl\n  5   4  bmi  hyp  chl\n  5   5  bmi  hyp  chl\n  6   1  bmi  hyp  chl\n  6   2  bmi  hyp  chl\n  6   3  bmi  hyp  chl\n  6   4  bmi  hyp  chl\n  6   5  bmi  hyp  chl\n  7   1  bmi  hyp  chl\n  7   2  bmi  hyp  chl\n  7   3  bmi  hyp  chl\n  7   4  bmi  hyp  chl\n  7   5  bmi  hyp  chl\n  8   1  bmi  hyp  chl\n  8   2  bmi  hyp  chl\n  8   3  bmi  hyp  chl\n  8   4  bmi  hyp  chl\n  8   5  bmi  hyp  chl\n  9   1  bmi  hyp  chl\n  9   2  bmi  hyp  chl\n  9   3  bmi  hyp  chl\n  9   4  bmi  hyp  chl\n  9   5  bmi  hyp  chl\n  10   1  bmi  hyp  chl\n  10   2  bmi  hyp  chl\n  10   3  bmi  hyp  chl\n  10   4  bmi  hyp  chl\n  10   5  bmi  hyp  chl\n\nimp_pmm\n\nClass: mids\nNumber of multiple imputations:  5 \nImputation methods:\n  age   bmi   hyp   chl \n   \"\" \"pmm\" \"pmm\" \"pmm\" \nPredictorMatrix:\n    age bmi hyp chl\nage   0   1   1   1\nbmi   1   0   1   1\nhyp   1   1   0   1\nchl   1   1   1   0\n\nimp_pmms$imp$bmi\n\n      1    2    3    4    5\n1  27.4 35.3 26.3 27.5 29.6\n3  29.6 29.6 26.3 26.3 29.6\n4  25.5 27.2 22.7 24.9 27.4\n6  20.4 21.7 21.7 24.9 27.4\n10 28.7 28.7 28.7 26.3 28.7\n11 30.1 29.6 26.3 27.5 35.3\n12 28.7 28.7 22.7 26.3 28.7\n16 29.6 30.1 26.3 26.3 35.3\n21 29.6 30.1 26.3 29.6 20.4"
+    "text": "Example\nWe use the small dataset nhanes included in mice package. It has 25 rows, and three out of four variables have missings.\nThe original NHANES data is a large national level survey, some are publicly available via R package nhanes.\n\nlibrary(mice)\n\nWarning in check_dep_version(): ABI version mismatch: \nlme4 was built with Matrix ABI version 1\nCurrent Matrix ABI version is 0\nPlease re-install lme4 from source or restore original 'Matrix' package\n\n\n\nAttaching package: 'mice'\n\n\nThe following object is masked from 'package:stats':\n\n    filter\n\n\nThe following objects are masked from 'package:base':\n\n    cbind, rbind\n\n# load example dataset from mice\nhead(nhanes)\n\n  age  bmi hyp chl\n1   1   NA  NA  NA\n2   2 22.7   1 187\n3   1   NA   1 187\n4   3   NA  NA  NA\n5   1 20.4   1 113\n6   3   NA  NA 184\n\nsummary(nhanes)\n\n      age            bmi             hyp             chl       \n Min.   :1.00   Min.   :20.40   Min.   :1.000   Min.   :113.0  \n 1st Qu.:1.00   1st Qu.:22.65   1st Qu.:1.000   1st Qu.:185.0  \n Median :2.00   Median :26.75   Median :1.000   Median :187.0  \n Mean   :1.76   Mean   :26.56   Mean   :1.235   Mean   :191.4  \n 3rd Qu.:2.00   3rd Qu.:28.93   3rd Qu.:1.000   3rd Qu.:212.0  \n Max.   :3.00   Max.   :35.30   Max.   :2.000   Max.   :284.0  \n                NA's   :9       NA's   :8       NA's   :10     \n\n\n\nImpute with PMM\nTo impute with PMM is straightforward: specify the method, method = pmm.\n\nimp_pmm <- mice(nhanes, method = 'pmm', m=5, maxit=10)\n\n\n iter imp variable\n  1   1  bmi  hyp  chl\n  1   2  bmi  hyp  chl\n  1   3  bmi  hyp  chl\n  1   4  bmi  hyp  chl\n  1   5  bmi  hyp  chl\n  2   1  bmi  hyp  chl\n  2   2  bmi  hyp  chl\n  2   3  bmi  hyp  chl\n  2   4  bmi  hyp  chl\n  2   5  bmi  hyp  chl\n  3   1  bmi  hyp  chl\n  3   2  bmi  hyp  chl\n  3   3  bmi  hyp  chl\n  3   4  bmi  hyp  chl\n  3   5  bmi  hyp  chl\n  4   1  bmi  hyp  chl\n  4   2  bmi  hyp  chl\n  4   3  bmi  hyp  chl\n  4   4  bmi  hyp  chl\n  4   5  bmi  hyp  chl\n  5   1  bmi  hyp  chl\n  5   2  bmi  hyp  chl\n  5   3  bmi  hyp  chl\n  5   4  bmi  hyp  chl\n  5   5  bmi  hyp  chl\n  6   1  bmi  hyp  chl\n  6   2  bmi  hyp  chl\n  6   3  bmi  hyp  chl\n  6   4  bmi  hyp  chl\n  6   5  bmi  hyp  chl\n  7   1  bmi  hyp  chl\n  7   2  bmi  hyp  chl\n  7   3  bmi  hyp  chl\n  7   4  bmi  hyp  chl\n  7   5  bmi  hyp  chl\n  8   1  bmi  hyp  chl\n  8   2  bmi  hyp  chl\n  8   3  bmi  hyp  chl\n  8   4  bmi  hyp  chl\n  8   5  bmi  hyp  chl\n  9   1  bmi  hyp  chl\n  9   2  bmi  hyp  chl\n  9   3  bmi  hyp  chl\n  9   4  bmi  hyp  chl\n  9   5  bmi  hyp  chl\n  10   1  bmi  hyp  chl\n  10   2  bmi  hyp  chl\n  10   3  bmi  hyp  chl\n  10   4  bmi  hyp  chl\n  10   5  bmi  hyp  chl\n\nimp_pmm\n\nClass: mids\nNumber of multiple imputations:  5 \nImputation methods:\n  age   bmi   hyp   chl \n   \"\" \"pmm\" \"pmm\" \"pmm\" \nPredictorMatrix:\n    age bmi hyp chl\nage   0   1   1   1\nbmi   1   0   1   1\nhyp   1   1   0   1\nchl   1   1   1   0\n\n# imputations for bmi\nimp_pmm$imp$bmi\n\n      1    2    3    4    5\n1  22.7 28.7 24.9 22.5 29.6\n3  35.3 30.1 28.7 28.7 28.7\n4  27.4 24.9 25.5 25.5 27.4\n6  20.4 24.9 21.7 24.9 21.7\n10 22.0 20.4 22.5 28.7 27.5\n11 20.4 22.0 27.2 33.2 26.3\n12 27.4 27.5 20.4 24.9 20.4\n16 22.5 27.4 29.6 28.7 24.9\n21 24.9 25.5 21.7 33.2 26.3\n\n\nAn alternative to the standard PMM is midastouch.\n\nimp_pmms <- mice(nhanes, method = 'midastouch', m=5, maxit=10)\n\n\n iter imp variable\n  1   1  bmi  hyp  chl\n  1   
2  bmi  hyp  chl\n  1   3  bmi  hyp  chl\n  1   4  bmi  hyp  chl\n  1   5  bmi  hyp  chl\n  2   1  bmi  hyp  chl\n  2   2  bmi  hyp  chl\n  2   3  bmi  hyp  chl\n  2   4  bmi  hyp  chl\n  2   5  bmi  hyp  chl\n  3   1  bmi  hyp  chl\n  3   2  bmi  hyp  chl\n  3   3  bmi  hyp  chl\n  3   4  bmi  hyp  chl\n  3   5  bmi  hyp  chl\n  4   1  bmi  hyp  chl\n  4   2  bmi  hyp  chl\n  4   3  bmi  hyp  chl\n  4   4  bmi  hyp  chl\n  4   5  bmi  hyp  chl\n  5   1  bmi  hyp  chl\n  5   2  bmi  hyp  chl\n  5   3  bmi  hyp  chl\n  5   4  bmi  hyp  chl\n  5   5  bmi  hyp  chl\n  6   1  bmi  hyp  chl\n  6   2  bmi  hyp  chl\n  6   3  bmi  hyp  chl\n  6   4  bmi  hyp  chl\n  6   5  bmi  hyp  chl\n  7   1  bmi  hyp  chl\n  7   2  bmi  hyp  chl\n  7   3  bmi  hyp  chl\n  7   4  bmi  hyp  chl\n  7   5  bmi  hyp  chl\n  8   1  bmi  hyp  chl\n  8   2  bmi  hyp  chl\n  8   3  bmi  hyp  chl\n  8   4  bmi  hyp  chl\n  8   5  bmi  hyp  chl\n  9   1  bmi  hyp  chl\n  9   2  bmi  hyp  chl\n  9   3  bmi  hyp  chl\n  9   4  bmi  hyp  chl\n  9   5  bmi  hyp  chl\n  10   1  bmi  hyp  chl\n  10   2  bmi  hyp  chl\n  10   3  bmi  hyp  chl\n  10   4  bmi  hyp  chl\n  10   5  bmi  hyp  chl\n\nimp_pmm\n\nClass: mids\nNumber of multiple imputations:  5 \nImputation methods:\n  age   bmi   hyp   chl \n   \"\" \"pmm\" \"pmm\" \"pmm\" \nPredictorMatrix:\n    age bmi hyp chl\nage   0   1   1   1\nbmi   1   0   1   1\nhyp   1   1   0   1\nchl   1   1   1   0\n\nimp_pmms$imp$bmi\n\n      1    2    3    4    5\n1  35.3 35.3 20.4 20.4 29.6\n3  30.1 30.1 30.1 29.6 22.0\n4  28.7 25.5 25.5 21.7 21.7\n6  22.5 25.5 21.7 21.7 21.7\n10 28.7 27.5 22.0 21.7 22.5\n11 30.1 35.3 20.4 29.6 29.6\n12 27.4 22.0 28.7 21.7 26.3\n16 33.2 35.3 20.4 29.6 29.6\n21 20.4 33.2 20.4 29.6 29.6"
   },
   {
     "objectID": "R/anova.html",
@@ -1705,7 +1705,7 @@
     "href": "python/logistic_regression.html#statsmodels-package",
     "title": "Logistic Regression",
     "section": "Statsmodels package",
-    "text": "Statsmodels package\nWe will use the sm.Logit() method to fit our logistic regression model.\n\n#intercept column\nx_sm = sm.add_constant(x)\n\n#fit model\nlr_sm = sm.Logit(y, x_sm).fit() \nprint(lr_sm.summary())\n\nOptimization terminated successfully.\n         Current function value: 0.568825\n         Iterations 5\n                           Logit Regression Results                           \n==============================================================================\nDep. Variable:                 wt_grp   No. Observations:                  167\nModel:                          Logit   Df Residuals:                      162\nMethod:                           MLE   Df Model:                            4\nDate:                Mon, 05 Aug 2024   Pseudo R-squ.:                 0.05169\nTime:                        13:32:37   Log-Likelihood:                -94.994\nconverged:                       True   LL-Null:                       -100.17\nCovariance Type:            nonrobust   LLR p-value:                   0.03484\n==============================================================================\n                 coef    std err          z      P>|z|      [0.025      0.975]\n------------------------------------------------------------------------------\nconst          3.3576      1.654      2.029      0.042       0.115       6.600\nage           -0.0126      0.021     -0.598      0.550      -0.054       0.029\nsex           -0.8645      0.371     -2.328      0.020      -1.592      -0.137\nph.ecog        0.4182      0.263      1.592      0.111      -0.097       0.933\nmeal.cal      -0.0009      0.000     -1.932      0.053      -0.002    1.27e-05\n==============================================================================\n\n\n\nModel fitting\nIn addition to the information contained in the summary, we can display the model coefficients as odds ratios:\n\nprint(\"Odds ratios for statsmodels logistic regression:\")\nprint(np.exp(lr_sm.params))\n\nOdds ratios for statsmodels logistic regression:\nconst       28.719651\nage          0.987467\nsex          0.421266\nph.ecog      1.519198\nmeal.cal     0.999140\ndtype: float64\n\n\nWe can also provide the 5% confidence intervals for the odds ratios:\n\nprint(\"CI at 5% for statsmodels logistic regression:\")\nprint(np.exp(lr_sm.conf_int(alpha = 0.05)))\n\nCI at 5% for statsmodels logistic regression:\n                 0           1\nconst     1.121742  735.301118\nage       0.947449    1.029175\nsex       0.203432    0.872354\nph.ecog   0.907984    2.541852\nmeal.cal  0.998269    1.000013\n\n\n\n\nPrediction\nLet’s use our trained model to make a weight loss prediction about a new patient.\n\n# new female, symptomatic but completely ambulatory patient consuming 2500 calories\nnew_pt = pd.DataFrame({\n    \"age\": [56],\n    \"sex\": [2],\n    \"ph.ecog\": [1.00], \n    \"meal.cal\": [2500]\n})\n\n# Add intercept term to the new data; for a single row this should be \n# forced using the `add_constant` command\nnew_pt_sm = sm.add_constant(new_pt, has_constant=\"add\")\nprint(\"Probability of weight loss using the statsmodels package:\")\nprint(lr_sm.predict(new_pt_sm))\n\nProbability of weight loss using the statsmodels package:\n0    0.308057\ndtype: float64"
+    "text": "Statsmodels package\nWe will use the sm.Logit() method to fit our logistic regression model.\n\n#intercept column\nx_sm = sm.add_constant(x)\n\n#fit model\nlr_sm = sm.Logit(y, x_sm).fit() \nprint(lr_sm.summary())\n\nOptimization terminated successfully.\n         Current function value: 0.568825\n         Iterations 5\n                           Logit Regression Results                           \n==============================================================================\nDep. Variable:                 wt_grp   No. Observations:                  167\nModel:                          Logit   Df Residuals:                      162\nMethod:                           MLE   Df Model:                            4\nDate:                Mon, 05 Aug 2024   Pseudo R-squ.:                 0.05169\nTime:                        13:33:11   Log-Likelihood:                -94.994\nconverged:                       True   LL-Null:                       -100.17\nCovariance Type:            nonrobust   LLR p-value:                   0.03484\n==============================================================================\n                 coef    std err          z      P>|z|      [0.025      0.975]\n------------------------------------------------------------------------------\nconst          3.3576      1.654      2.029      0.042       0.115       6.600\nage           -0.0126      0.021     -0.598      0.550      -0.054       0.029\nsex           -0.8645      0.371     -2.328      0.020      -1.592      -0.137\nph.ecog        0.4182      0.263      1.592      0.111      -0.097       0.933\nmeal.cal      -0.0009      0.000     -1.932      0.053      -0.002    1.27e-05\n==============================================================================\n\n\n\nModel fitting\nIn addition to the information contained in the summary, we can display the model coefficients as odds ratios:\n\nprint(\"Odds ratios for statsmodels logistic regression:\")\nprint(np.exp(lr_sm.params))\n\nOdds ratios for statsmodels logistic regression:\nconst       28.719651\nage          0.987467\nsex          0.421266\nph.ecog      1.519198\nmeal.cal     0.999140\ndtype: float64\n\n\nWe can also provide the 5% confidence intervals for the odds ratios:\n\nprint(\"CI at 5% for statsmodels logistic regression:\")\nprint(np.exp(lr_sm.conf_int(alpha = 0.05)))\n\nCI at 5% for statsmodels logistic regression:\n                 0           1\nconst     1.121742  735.301118\nage       0.947449    1.029175\nsex       0.203432    0.872354\nph.ecog   0.907984    2.541852\nmeal.cal  0.998269    1.000013\n\n\n\n\nPrediction\nLet’s use our trained model to make a weight loss prediction about a new patient.\n\n# new female, symptomatic but completely ambulatory patient consuming 2500 calories\nnew_pt = pd.DataFrame({\n    \"age\": [56],\n    \"sex\": [2],\n    \"ph.ecog\": [1.00], \n    \"meal.cal\": [2500]\n})\n\n# Add intercept term to the new data; for a single row this should be \n# forced using the `add_constant` command\nnew_pt_sm = sm.add_constant(new_pt, has_constant=\"add\")\nprint(\"Probability of weight loss using the statsmodels package:\")\nprint(lr_sm.predict(new_pt_sm))\n\nProbability of weight loss using the statsmodels package:\n0    0.308057\ndtype: float64"
   },
   {
     "objectID": "python/logistic_regression.html#scikit-learn-package",
@@ -1747,7 +1747,7 @@
     "href": "python/ancova.html#ancova-in-python",
     "title": "Ancova",
     "section": "Ancova in Python",
-    "text": "Ancova in Python\nIn Python, Ancova can be performed using the statsmodels library from the scipy package.\n\nimport statsmodels.api as sm\nimport statsmodels.formula.api as smf\nfrom tabulate import tabulate\n\n# Fit the ANCOVA model\nmodel_ancova = smf.ols('post ~ drug + pre', data=df).fit()\n\n# Summary of the model\nmodel_summary = model_ancova.summary()\nprint(model_summary)\n\n# Extracting glance (summary) information\nmodel_glance = {\n    'r_squared': model_ancova.rsquared,\n    'adj_r_squared': model_ancova.rsquared_adj,\n    'f_statistic': model_ancova.fvalue,\n    'f_pvalue': model_ancova.f_pvalue,\n    'aic': model_ancova.aic,\n    'bic': model_ancova.bic\n}\nmodel_glance_df = pd.DataFrame([model_glance])\nprint(tabulate(model_glance_df, headers='keys', tablefmt='grid'))\n\n# Extracting tidy (coefficients) information\nmodel_tidy = model_ancova.summary2().tables[1]\nprint(tabulate(model_tidy, headers='keys', tablefmt='grid'))\n\n                            OLS Regression Results                            \n==============================================================================\nDep. Variable:                   post   R-squared:                       0.676\nModel:                            OLS   Adj. R-squared:                  0.639\nMethod:                 Least Squares   F-statistic:                     18.10\nDate:                Mon, 05 Aug 2024   Prob (F-statistic):           1.50e-06\nTime:                        13:32:43   Log-Likelihood:                -82.054\nNo. Observations:                  30   AIC:                             172.1\nDf Residuals:                      26   BIC:                             177.7\nDf Model:                           3                                         \nCovariance Type:            nonrobust                                         \n==============================================================================\n                 coef    std err          t      P>|t|      [0.025      0.975]\n------------------------------------------------------------------------------\nIntercept     -3.8808      1.986     -1.954      0.062      -7.964       0.202\ndrug[T.D]      0.1090      1.795      0.061      0.952      -3.581       3.799\ndrug[T.F]      3.4461      1.887      1.826      0.079      -0.432       7.324\npre            0.9872      0.164      6.001      0.000       0.649       1.325\n==============================================================================\nOmnibus:                        2.609   Durbin-Watson:                   2.526\nProb(Omnibus):                  0.271   Jarque-Bera (JB):                2.148\nSkew:                           0.645   Prob(JB):                        0.342\nKurtosis:                       2.765   Cond. No.                         
39.8\n==============================================================================\n\nNotes:\n[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n+----+-------------+-----------------+---------------+-------------+---------+---------+\n|    |   r_squared |   adj_r_squared |   f_statistic |    f_pvalue |     aic |     bic |\n+====+=============+=================+===============+=============+=========+=========+\n|  0 |    0.676261 |        0.638906 |       18.1039 | 1.50137e-06 | 172.108 | 177.712 |\n+----+-------------+-----------------+---------------+-------------+---------+---------+\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n|           |     Coef. |   Std.Err. |          t |       P>|t| |    [0.025 |   0.975] |\n+===========+===========+============+============+=============+===========+==========+\n| Intercept | -3.88081  |   1.9862   | -1.95388   | 0.0615519   | -7.96351  | 0.201887 |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n| drug[T.D] |  0.108971 |   1.79514  |  0.0607037 | 0.952059    | -3.58098  | 3.79892  |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n| drug[T.F] |  3.44614  |   1.88678  |  1.82646   | 0.0792846   | -0.432195 | 7.32447  |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n| pre       |  0.987184 |   0.164498 |  6.00121   | 2.45433e-06 |  0.649054 | 1.32531  |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n\n\nPlease note that all values match with the corresponding R version, except for the AIC and BIC values, which differ slightly. This should be acceptable for most practical purposes in statistical analysis. Currently, there are ongoing discussions in the statsmodels community regarding the computational details of AIC and BIC.\nThe following code can be used to enforce complete consistency of AIC and BIC values with R outputs by adding 1 to the number of parameters:\n\nimport numpy as np\n\n# Manual calculation of AIC and BIC to ensure consistency with R\nn = df.shape[0]  # number of observations\nk = model_ancova.df_model + 1  # number of parameters (including intercept)\nlog_lik = model_ancova.llf  # log-likelihood\n\n# Adjusted number of parameters (including scale parameter)\nk_adjusted = k + 1\n\n# Manually calculate AIC and BIC to match R's behavior\naic_adjusted = 2 * k_adjusted - 2 * log_lik\nbic_adjusted = np.log(n) * k_adjusted - 2 * log_lik\n\nprint(f\"Number of observations (n): {n}\")\nprint(f\"Number of parameters (k_adjusted): {k_adjusted}\")\nprint(f\"Log-likelihood: {log_lik}\")\nprint(f\"AIC (adjusted): {aic_adjusted}\")\nprint(f\"BIC (adjusted): {bic_adjusted}\")\n\nNumber of observations (n): 30\nNumber of parameters (k_adjusted): 5.0\nLog-likelihood: -82.0537744890265\nAIC (adjusted): 174.107548978053\nBIC (adjusted): 181.11353588636376\n\n\nThere are different types of anova computations. The statsmodels.stats.anova.anova_lm function allows the types 1, 2 and 3. 
The code to compute these types is depicted below:\n\nimport statsmodels.formula.api as smf\nimport statsmodels.stats.anova as ssa\n\n# Center the predictor for Type III anova\n#df['pre_centered'] = df['pre'] - df['pre'].mean()\n\n# Fit the model for types I and II anova\nmodel = smf.ols('post ~ C(drug) + pre', data=df).fit()\n\n# Perform anova for types I and II\nancova_table_type_1 = ssa.anova_lm(model, typ=1)\nancova_table_type_2 = ssa.anova_lm(model, typ=2)\n\n# Fit the model for Type III anova with centered predictors\nmodel_type_3 = smf.ols('post ~ C(drug) + pre', data=df).fit()\nancova_table_type_3 = ssa.anova_lm(model_type_3, typ=3)\n\n# Calculate SSd (sum of squares for residuals)\nssd_type1 = ancova_table_type_1['sum_sq'].loc['Residual']\nssd_type2 = ancova_table_type_2['sum_sq'].loc['Residual']\nssd_type3 = ancova_table_type_3['sum_sq'].loc['Residual']\n\n# Calculate ges\nancova_table_type_1['ges'] = ancova_table_type_1['sum_sq'] / (ancova_table_type_1['sum_sq'] + ssd_type1)\nancova_table_type_2['ges'] = ancova_table_type_2['sum_sq'] / (ancova_table_type_2['sum_sq'] + ssd_type2)\nancova_table_type_3['ges'] = ancova_table_type_3['sum_sq'] / (ancova_table_type_3['sum_sq'] + ssd_type3)\n\n# Add SSd column\nancova_table_type_1['SSd'] = ssd_type1\nancova_table_type_2['SSd'] = ssd_type2\nancova_table_type_3['SSd'] = ssd_type3\n\n# Add significance column\nancova_table_type_1['p<0.05'] = ancova_table_type_1['PR(>F)'] < 0.05\nancova_table_type_2['p<0.05'] = ancova_table_type_2['PR(>F)'] < 0.05\nancova_table_type_3['p<0.05'] = ancova_table_type_3['PR(>F)'] < 0.05\n\n# Rename columns to match the R output\nancova_table_type_1.rename(columns={'sum_sq': 'SSn', 'df': 'DFn', 'F': 'F', 'PR(>F)': 'p'}, inplace=True)\nancova_table_type_1.reset_index(inplace=True)\nancova_table_type_1.rename(columns={'index': 'Effect'}, inplace=True)\n\nancova_table_type_2.rename(columns={'sum_sq': 'SSn', 'df': 'DFn', 'F': 'F', 'PR(>F)': 'p'}, inplace=True)\nancova_table_type_2.reset_index(inplace=True)\nancova_table_type_2.rename(columns={'index': 'Effect'}, inplace=True)\n\nancova_table_type_3.rename(columns={'sum_sq': 'SSn', 'df': 'DFn', 'F': 'F', 'PR(>F)': 'p'}, inplace=True)\nancova_table_type_3.reset_index(inplace=True)\nancova_table_type_3.rename(columns={'index': 'Effect'}, inplace=True)\n\n# Calculate DFd (degrees of freedom for residuals)\ndfd_type1 = ancova_table_type_1.loc[ancova_table_type_1['Effect'] == 'Residual', 'DFn'].values[0]\ndfd_type2 = ancova_table_type_2.loc[ancova_table_type_2['Effect'] == 'Residual', 'DFn'].values[0]\ndfd_type3 = ancova_table_type_3.loc[ancova_table_type_3['Effect'] == 'Residual', 'DFn'].values[0]\nancova_table_type_1['DFd'] = dfd_type1\nancova_table_type_2['DFd'] = dfd_type2\nancova_table_type_3['DFd'] = dfd_type3\n\n# Filter out the Residual row\nancova_table_type_1 = ancova_table_type_1[ancova_table_type_1['Effect'] != 'Residual']\nancova_table_type_2 = ancova_table_type_2[ancova_table_type_2['Effect'] != 'Residual']\nancova_table_type_3 = ancova_table_type_3[ancova_table_type_3['Effect'] != 'Residual']\n\n# Select and reorder columns to match the R output\nancova_table_type_1 = ancova_table_type_1[['Effect', 'DFn', 'DFd', 'SSn', 'SSd', 'F', 'p', 'p<0.05', 'ges']]\nancova_table_type_2 = ancova_table_type_2[['Effect', 'DFn', 'DFd', 'SSn', 'SSd', 'F', 'p', 'p<0.05', 'ges']]\nancova_table_type_3 = ancova_table_type_3[['Effect', 'DFn', 'DFd', 'SSn', 'SSd', 'F', 'p', 'p<0.05', 'ges']]"
+    "text": "Ancova in Python\nIn Python, Ancova can be performed using the statsmodels library from the scipy package.\n\nimport statsmodels.api as sm\nimport statsmodels.formula.api as smf\nfrom tabulate import tabulate\n\n# Fit the ANCOVA model\nmodel_ancova = smf.ols('post ~ drug + pre', data=df).fit()\n\n# Summary of the model\nmodel_summary = model_ancova.summary()\nprint(model_summary)\n\n# Extracting glance (summary) information\nmodel_glance = {\n    'r_squared': model_ancova.rsquared,\n    'adj_r_squared': model_ancova.rsquared_adj,\n    'f_statistic': model_ancova.fvalue,\n    'f_pvalue': model_ancova.f_pvalue,\n    'aic': model_ancova.aic,\n    'bic': model_ancova.bic\n}\nmodel_glance_df = pd.DataFrame([model_glance])\nprint(tabulate(model_glance_df, headers='keys', tablefmt='grid'))\n\n# Extracting tidy (coefficients) information\nmodel_tidy = model_ancova.summary2().tables[1]\nprint(tabulate(model_tidy, headers='keys', tablefmt='grid'))\n\n                            OLS Regression Results                            \n==============================================================================\nDep. Variable:                   post   R-squared:                       0.676\nModel:                            OLS   Adj. R-squared:                  0.639\nMethod:                 Least Squares   F-statistic:                     18.10\nDate:                Mon, 05 Aug 2024   Prob (F-statistic):           1.50e-06\nTime:                        13:33:17   Log-Likelihood:                -82.054\nNo. Observations:                  30   AIC:                             172.1\nDf Residuals:                      26   BIC:                             177.7\nDf Model:                           3                                         \nCovariance Type:            nonrobust                                         \n==============================================================================\n                 coef    std err          t      P>|t|      [0.025      0.975]\n------------------------------------------------------------------------------\nIntercept     -3.8808      1.986     -1.954      0.062      -7.964       0.202\ndrug[T.D]      0.1090      1.795      0.061      0.952      -3.581       3.799\ndrug[T.F]      3.4461      1.887      1.826      0.079      -0.432       7.324\npre            0.9872      0.164      6.001      0.000       0.649       1.325\n==============================================================================\nOmnibus:                        2.609   Durbin-Watson:                   2.526\nProb(Omnibus):                  0.271   Jarque-Bera (JB):                2.148\nSkew:                           0.645   Prob(JB):                        0.342\nKurtosis:                       2.765   Cond. No.                         
39.8\n==============================================================================\n\nNotes:\n[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n+----+-------------+-----------------+---------------+-------------+---------+---------+\n|    |   r_squared |   adj_r_squared |   f_statistic |    f_pvalue |     aic |     bic |\n+====+=============+=================+===============+=============+=========+=========+\n|  0 |    0.676261 |        0.638906 |       18.1039 | 1.50137e-06 | 172.108 | 177.712 |\n+----+-------------+-----------------+---------------+-------------+---------+---------+\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n|           |     Coef. |   Std.Err. |          t |       P>|t| |    [0.025 |   0.975] |\n+===========+===========+============+============+=============+===========+==========+\n| Intercept | -3.88081  |   1.9862   | -1.95388   | 0.0615519   | -7.96351  | 0.201887 |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n| drug[T.D] |  0.108971 |   1.79514  |  0.0607037 | 0.952059    | -3.58098  | 3.79892  |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n| drug[T.F] |  3.44614  |   1.88678  |  1.82646   | 0.0792846   | -0.432195 | 7.32447  |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n| pre       |  0.987184 |   0.164498 |  6.00121   | 2.45433e-06 |  0.649054 | 1.32531  |\n+-----------+-----------+------------+------------+-------------+-----------+----------+\n\n\nPlease note that all values match with the corresponding R version, except for the AIC and BIC values, which differ slightly. This should be acceptable for most practical purposes in statistical analysis. Currently, there are ongoing discussions in the statsmodels community regarding the computational details of AIC and BIC.\nThe following code can be used to enforce complete consistency of AIC and BIC values with R outputs by adding 1 to the number of parameters:\n\nimport numpy as np\n\n# Manual calculation of AIC and BIC to ensure consistency with R\nn = df.shape[0]  # number of observations\nk = model_ancova.df_model + 1  # number of parameters (including intercept)\nlog_lik = model_ancova.llf  # log-likelihood\n\n# Adjusted number of parameters (including scale parameter)\nk_adjusted = k + 1\n\n# Manually calculate AIC and BIC to match R's behavior\naic_adjusted = 2 * k_adjusted - 2 * log_lik\nbic_adjusted = np.log(n) * k_adjusted - 2 * log_lik\n\nprint(f\"Number of observations (n): {n}\")\nprint(f\"Number of parameters (k_adjusted): {k_adjusted}\")\nprint(f\"Log-likelihood: {log_lik}\")\nprint(f\"AIC (adjusted): {aic_adjusted}\")\nprint(f\"BIC (adjusted): {bic_adjusted}\")\n\nNumber of observations (n): 30\nNumber of parameters (k_adjusted): 5.0\nLog-likelihood: -82.0537744890265\nAIC (adjusted): 174.107548978053\nBIC (adjusted): 181.11353588636376\n\n\nThere are different types of anova computations. The statsmodels.stats.anova.anova_lm function allows the types 1, 2 and 3. 
The code to compute these types is depicted below:\n\nimport statsmodels.formula.api as smf\nimport statsmodels.stats.anova as ssa\n\n# Center the predictor for Type III anova\n#df['pre_centered'] = df['pre'] - df['pre'].mean()\n\n# Fit the model for types I and II anova\nmodel = smf.ols('post ~ C(drug) + pre', data=df).fit()\n\n# Perform anova for types I and II\nancova_table_type_1 = ssa.anova_lm(model, typ=1)\nancova_table_type_2 = ssa.anova_lm(model, typ=2)\n\n# Fit the model for Type III anova with centered predictors\nmodel_type_3 = smf.ols('post ~ C(drug) + pre', data=df).fit()\nancova_table_type_3 = ssa.anova_lm(model_type_3, typ=3)\n\n# Calculate SSd (sum of squares for residuals)\nssd_type1 = ancova_table_type_1['sum_sq'].loc['Residual']\nssd_type2 = ancova_table_type_2['sum_sq'].loc['Residual']\nssd_type3 = ancova_table_type_3['sum_sq'].loc['Residual']\n\n# Calculate ges\nancova_table_type_1['ges'] = ancova_table_type_1['sum_sq'] / (ancova_table_type_1['sum_sq'] + ssd_type1)\nancova_table_type_2['ges'] = ancova_table_type_2['sum_sq'] / (ancova_table_type_2['sum_sq'] + ssd_type2)\nancova_table_type_3['ges'] = ancova_table_type_3['sum_sq'] / (ancova_table_type_3['sum_sq'] + ssd_type3)\n\n# Add SSd column\nancova_table_type_1['SSd'] = ssd_type1\nancova_table_type_2['SSd'] = ssd_type2\nancova_table_type_3['SSd'] = ssd_type3\n\n# Add significance column\nancova_table_type_1['p<0.05'] = ancova_table_type_1['PR(>F)'] < 0.05\nancova_table_type_2['p<0.05'] = ancova_table_type_2['PR(>F)'] < 0.05\nancova_table_type_3['p<0.05'] = ancova_table_type_3['PR(>F)'] < 0.05\n\n# Rename columns to match the R output\nancova_table_type_1.rename(columns={'sum_sq': 'SSn', 'df': 'DFn', 'F': 'F', 'PR(>F)': 'p'}, inplace=True)\nancova_table_type_1.reset_index(inplace=True)\nancova_table_type_1.rename(columns={'index': 'Effect'}, inplace=True)\n\nancova_table_type_2.rename(columns={'sum_sq': 'SSn', 'df': 'DFn', 'F': 'F', 'PR(>F)': 'p'}, inplace=True)\nancova_table_type_2.reset_index(inplace=True)\nancova_table_type_2.rename(columns={'index': 'Effect'}, inplace=True)\n\nancova_table_type_3.rename(columns={'sum_sq': 'SSn', 'df': 'DFn', 'F': 'F', 'PR(>F)': 'p'}, inplace=True)\nancova_table_type_3.reset_index(inplace=True)\nancova_table_type_3.rename(columns={'index': 'Effect'}, inplace=True)\n\n# Calculate DFd (degrees of freedom for residuals)\ndfd_type1 = ancova_table_type_1.loc[ancova_table_type_1['Effect'] == 'Residual', 'DFn'].values[0]\ndfd_type2 = ancova_table_type_2.loc[ancova_table_type_2['Effect'] == 'Residual', 'DFn'].values[0]\ndfd_type3 = ancova_table_type_3.loc[ancova_table_type_3['Effect'] == 'Residual', 'DFn'].values[0]\nancova_table_type_1['DFd'] = dfd_type1\nancova_table_type_2['DFd'] = dfd_type2\nancova_table_type_3['DFd'] = dfd_type3\n\n# Filter out the Residual row\nancova_table_type_1 = ancova_table_type_1[ancova_table_type_1['Effect'] != 'Residual']\nancova_table_type_2 = ancova_table_type_2[ancova_table_type_2['Effect'] != 'Residual']\nancova_table_type_3 = ancova_table_type_3[ancova_table_type_3['Effect'] != 'Residual']\n\n# Select and reorder columns to match the R output\nancova_table_type_1 = ancova_table_type_1[['Effect', 'DFn', 'DFd', 'SSn', 'SSd', 'F', 'p', 'p<0.05', 'ges']]\nancova_table_type_2 = ancova_table_type_2[['Effect', 'DFn', 'DFd', 'SSn', 'SSd', 'F', 'p', 'p<0.05', 'ges']]\nancova_table_type_3 = ancova_table_type_3[['Effect', 'DFn', 'DFd', 'SSn', 'SSd', 'F', 'p', 'p<0.05', 'ges']]"
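As a final step, the three reformatted tables can be printed for inspection; below is a minimal sketch using the tabulate helper imported earlier:\n\n# Display each ANOVA table in the same grid format as the outputs above\nfor name, tab in [('Type 1', ancova_table_type_1), ('Type 2', ancova_table_type_2), ('Type 3', ancova_table_type_3)]:\n    print(name)\n    print(tabulate(tab, headers='keys', tablefmt='grid', showindex=False))"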
   },
   {
     "objectID": "python/ancova.html#type-1-ancova-in-python",