Configuration file
MARVELous uses a configuration file to specify all input paths, options, and analysis settings. This notebook illustrates the tools available to create and parse it.
The configuration file is a flat text file with section headers enclosed in square brackets, e.g. [GenoInput]. Each line beneath a header defines one option as a left-hand side (LHS) key and a right-hand side (RHS) value. Unrecognised options are ignored.
Create and parse a configuration file
To create a configuration file, use the provided create_config function — it validates inputs before writing. When the file is used to run the pipeline, it is parsed by ConfigParser, which performs additional checks and returns a configuration object that MARVELousPipeline consumes directly.
[ ]:
import glob
import logging
import os
import re
import shutil
import tempfile

import pandas as pd

from marvel.constants import (
    CONFIG_HEADERS as CFNames,
    CONFIG_LHS as CFLhs,
)
from marvel.example_data import examples
from marvel.pipeline import MARVELousPipeline
from marvel.utils.config_tools import create_config, ConfigParser, check_config_file
# Requesting one example file triggers generation of the example data;
# every other example file is then read from that file's directory.
example_vcf_path = examples.get_path('vcf_example', 'example_genotypes_chr1.vcf.gz')
input_dir = os.path.dirname(example_vcf_path)
Setup: Define input files and tests
[2]:
# Map each chromosome label (e.g. 'chr1') to its VCF file.
# The files are sorted because glob returns paths in arbitrary, OS-dependent
# order, and the label is taken from the file name only so that a 'chrN'-like
# directory name higher up in the path cannot be matched by mistake.
geno_files = sorted(glob.glob(os.path.join(input_dir, 'example*vcf.gz')))
geno_input = {}
for geno_file in geno_files:
    chrom_match = re.search(r'chr\d+', os.path.basename(geno_file), re.IGNORECASE)
    if chrom_match is None:
        # Fail with a readable message instead of AttributeError on None.group
        raise ValueError(f"Cannot infer chromosome label from file name: {geno_file}")
    geno_input[chrom_match.group(0)] = geno_file

# Define phenotype and covariates file
pheno_file = os.path.join(input_dir, 'phenotypes.tsv')
cov_file = os.path.join(input_dir, 'covariates.tsv')

# Tests to be performed per outcome: {outcome column: [test names]}
# Continuous outcomes
con_tests = {
    'sbp': ['OLS', 'KW'],
    'ldl_cholesterol': ['OLS', 'AOV'],
    'crp_level': ['OLS', 'KW', 'AOV'],
}
# Categorical outcomes
cat_tests = {
    'disease_severity': ['CHISQ'],
    'treatment_response': ['CHISQ'],
    'risk_category': ['CHISQ'],
}
# Binary outcomes
bin_tests = {
    'hypertension': ['GLM-Binom', 'CHISQ'],
    'diabetes': ['GLM-Binom', 'FISHER'],
    'cvd': ['GLM-Binom', 'CHISQ', 'FISHER'],
}

# Create a temporary output directory for this example
output_dir = tempfile.mkdtemp(prefix='marvel_example_')
Create configuration file
We’ll create a configuration file with all the necessary options for variant extraction and association testing.
[3]:
# Write the configuration file into the temporary output directory.
# create_config validates the inputs before the file is written.
config_file = os.path.join(output_dir, 'example.cnf')
create_config(
    path=config_file,
    # Variant extraction
    extract_variants=True,
    geno_input=geno_input,
    variant_input={'variants': os.path.join(input_dir, 'gene_variants.tsv')},
    variant_output={CFLhs.VarOutput: os.path.join(output_dir, 'variant_carriers')},
    # Association testing
    association_analysis=True,
    pheno_input={CFLhs.PhenoFile: pheno_file, CFLhs.CovFile: cov_file},
    con_tests=con_tests,
    cat_tests=cat_tests,
    bin_tests=bin_tests,
    covs={'Unadjusted': 'None', 'Adjusted': ['age', 'sex']},
    # General options
    id_column='id',
    raise_on_error=True,
    ref_column='a1',
    alt_column='a2',
    cat_column='gene',
    incl_var=False,
    output_path=output_dir,
)
Parse and validate configuration file
The configuration file is parsed into a dictionary and validated to ensure all required files exist and options are valid.
[4]:
# Parse the configuration file written in the previous cell
config = ConfigParser(config_file)
# Validate the parsed configuration (required files exist, options are valid)
check_config_file(config)
# Display the parsed configuration as the cell output
config
[4]:
{'GenoInput': {'chr22': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr22.vcf.gz',
'chr16': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr16.vcf.gz',
'chr1': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr1.vcf.gz',
'chr7': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr7.vcf.gz',
'chr5': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr5.vcf.gz'},
'VarInput': {'variants': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/gene_variants.tsv'},
'Output': {'VarOutput': '/var/folders/pn/y5_78t7d3zsbvv25wz7s5xsr0000gn/T/marvel_example_n49pr04v/variant_carriers'},
'ExpInput': {'VarOutput': '/var/folders/pn/y5_78t7d3zsbvv25wz7s5xsr0000gn/T/marvel_example_n49pr04v/variant_carriers'},
'PhenoInput': {'phenotypes': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/phenotypes.tsv',
'covariates': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/covariates.tsv'},
'ConTests': {'sbp': 'OLS;KW',
'ldl_cholesterol': 'OLS;AOV',
'crp_level': 'OLS;KW;AOV'},
'CatTests': {'disease_severity': 'CHISQ',
'treatment_response': 'CHISQ',
'risk_category': 'CHISQ'},
'BinTests': {'hypertension': 'GLM-Binom;CHISQ',
'diabetes': 'GLM-Binom;FISHER',
'cvd': 'GLM-Binom;CHISQ;FISHER'},
'SurvTests': {},
'Covs': {'Unadjusted': None, 'Adjusted': 'age;sex'},
'Stratification': {},
'Options': {'id_column': 'id',
'raise_on_error': True,
'ref_column': 'a1',
'alt_column': 'a2',
'cat_column': 'gene',
'incl_var': False,
'output_path': '/var/folders/pn/y5_78t7d3zsbvv25wz7s5xsr0000gn/T/marvel_example_n49pr04v',
'extract_variants': True,
'association_analysis': True}}
Run the complete pipeline
The MARVELousPipeline class provides a complete pipeline for variant extraction and association testing. It will:
- Extract variants from VCF files
- Aggregate variants to genes (if `cat_column` is specified)
- Perform association tests for all specified outcomes and models
[ ]:
# Build the pipeline directly from the configuration file and run it
pipeline = MARVELousPipeline.from_config_file(config_file, verbose=True)
results = pipeline.run() # prints a lot of warnings
# Display the summary returned by the run as the cell output
results
Examine the results
Let’s look at the extracted gene carriers and association testing results.
[6]:
# Read the tab-separated gene carriers table written by the pipeline run
carriers_file = os.path.join(output_dir, 'variant_carriers_carriers.tsv.gz')
carriers = pd.read_csv(carriers_file, delimiter='\t')
# Show the table as the cell output
carriers
[6]:
| id | PCSK9 | ANGPTL3 | HMGCR | NPC1L1 | CETP | PPARA | |
|---|---|---|---|---|---|---|---|
| 0 | ID_000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | ID_000001 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | ID_000002 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | ID_000003 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | ID_000004 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | ID_000995 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 996 | ID_000996 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 997 | ID_000997 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 998 | ID_000998 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 999 | ID_000999 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1000 rows × 7 columns
[7]:
# Find association results files. They are sorted because glob returns paths
# in arbitrary, filesystem-dependent order, which would make the displayed
# example differ between machines.
results_files = sorted(glob.glob(os.path.join(output_dir, '*_results.tsv.gz')))
# Fail with a clear message instead of an IndexError when the pipeline wrote nothing
assert results_files, f"No '*_results.tsv.gz' files found in {output_dir}"
# Load the first results file as an example
example_results = pd.read_csv(results_files[0], sep='\t')
example_results
[7]:
| Model | Model name | Stratification name | Variable | Exposure | Cases | Samples | Exposed | Non-exposed | Point Estimates | Standard Error | Test statistic | P-values | Formatted | Formatted (exp) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | OLS | Unadjusted | overall | sbp | Intercept | 0.0 | 595.0 | 40.0 | 554.0 | 120.302849 | 0.650156 | NaN | 0.000000e+00 | 120.30 (119.03; 121.58) | 1765482086878253772055506936203298189383022575... |
| 1 | OLS | Unadjusted | overall | sbp | PPARA | 0.0 | 595.0 | 40.0 | 554.0 | -1.017021 | 2.390836 | NaN | 6.707119e-01 | -1.02 (-5.70; 3.67) | 0.36 (0.00; 39.21) |
| 2 | KW | Unadjusted | overall | sbp | PPARA | 0.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 0.729107 | 6.945068e-01 | NaN | NaN |
| 3 | OLS | Unadjusted | overall | ldl_cholesterol | Intercept | 0.0 | 595.0 | 40.0 | 554.0 | 5.120017 | 0.047318 | NaN | 0.000000e+00 | 5.12 (5.03; 5.21) | 167.34 (152.52; 183.60) |
| 4 | OLS | Unadjusted | overall | ldl_cholesterol | PPARA | 0.0 | 595.0 | 40.0 | 554.0 | 0.157997 | 0.174005 | NaN | 3.642451e-01 | 0.16 (-0.18; 0.50) | 1.17 (0.83; 1.65) |
| 5 | AOV | Unadjusted | overall | ldl_cholesterol | PPARA | 0.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 0.830076 | 4.365229e-01 | NaN | NaN |
| 6 | OLS | Unadjusted | overall | crp_level | Intercept | 0.0 | 595.0 | 40.0 | 554.0 | 37.760859 | 0.850326 | NaN | 9.706392e-191 | 37.76 (36.09; 39.43) | 25080298728020192.00 (4737333186671159.00; 132... |
| 7 | OLS | Unadjusted | overall | crp_level | PPARA | 0.0 | 595.0 | 40.0 | 554.0 | -0.690148 | 3.126926 | NaN | 8.253932e-01 | -0.69 (-6.82; 5.44) | 0.50 (0.00; 230.10) |
| 8 | KW | Unadjusted | overall | crp_level | PPARA | 0.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 3.398355 | 1.828339e-01 | NaN | NaN |
| 9 | AOV | Unadjusted | overall | crp_level | PPARA | 0.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 0.521440 | 5.939377e-01 | NaN | NaN |
| 10 | CHISQ | Unadjusted | overall | disease_severity | PPARA | NaN | NaN | NaN | NaN | NaN | NaN | 1.349363 | 8.529465e-01 | NaN | NaN |
| 11 | CHISQ | Unadjusted | overall | treatment_response | PPARA | NaN | NaN | NaN | NaN | NaN | NaN | 5.692997 | 2.232786e-01 | NaN | NaN |
| 12 | CHISQ | Unadjusted | overall | risk_category | PPARA | NaN | NaN | NaN | NaN | NaN | NaN | 2.151600 | 9.052412e-01 | NaN | NaN |
| 13 | GLM-Binom | Unadjusted | overall | hypertension | Intercept | 167.0 | 595.0 | 40.0 | 554.0 | -0.968590 | 0.095037 | NaN | 2.159747e-24 | -0.97 (-1.15; -0.78) | 0.38 (0.32; 0.46) |
| 14 | GLM-Binom | Unadjusted | overall | hypertension | PPARA | 167.0 | 595.0 | 40.0 | 554.0 | 0.362809 | 0.327072 | NaN | 2.673155e-01 | 0.36 (-0.28; 1.00) | 1.44 (0.76; 2.73) |
| 15 | CHISQ | Unadjusted | overall | hypertension | PPARA | 167.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 2.262074 | 3.226984e-01 | NaN | NaN |
| 16 | GLM-Binom | Unadjusted | overall | diabetes | Intercept | 90.0 | 595.0 | 40.0 | 554.0 | -1.693308 | 0.117248 | NaN | 2.810478e-47 | -1.69 (-1.92; -1.46) | 0.18 (0.15; 0.23) |
| 17 | GLM-Binom | Unadjusted | overall | diabetes | PPARA | 90.0 | 595.0 | 40.0 | 554.0 | -0.537188 | 0.530297 | NaN | 3.110627e-01 | -0.54 (-1.58; 0.50) | 0.58 (0.21; 1.65) |
| 18 | FISHER | Unadjusted | overall | diabetes | PPARA | 90.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 0.109807 | 5.690000e-01 | NaN | NaN |
| 19 | GLM-Binom | Unadjusted | overall | cvd | Intercept | 272.0 | 595.0 | 40.0 | 554.0 | -0.163532 | 0.085188 | NaN | 5.490186e-02 | -0.16 (-0.33; 0.00) | 0.85 (0.72; 1.00) |
| 20 | GLM-Binom | Unadjusted | overall | cvd | PPARA | 272.0 | 595.0 | 40.0 | 554.0 | -0.118555 | 0.315469 | NaN | 7.070608e-01 | -0.12 (-0.74; 0.50) | 0.89 (0.48; 1.65) |
| 21 | CHISQ | Unadjusted | overall | cvd | PPARA | 272.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 1.735910 | 4.198092e-01 | NaN | NaN |
| 22 | FISHER | Unadjusted | overall | cvd | PPARA | 272.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 0.045933 | 4.146000e-01 | NaN | NaN |
| 23 | OLS | Adjusted | overall | sbp | Intercept | 0.0 | 595.0 | 40.0 | 554.0 | 117.703262 | 2.516429 | NaN | 8.164671e-201 | 117.70 (112.77; 122.64) | 1311828317954960423126233004197845153998444657... |
| 24 | OLS | Adjusted | overall | sbp | PPARA | 0.0 | 595.0 | 40.0 | 554.0 | -0.903887 | 2.391981 | NaN | 7.056537e-01 | -0.90 (-5.59; 3.78) | 0.40 (0.00; 44.01) |
| 25 | OLS | Adjusted | overall | sbp | age | 0.0 | 595.0 | 40.0 | 554.0 | 0.027950 | 0.042458 | NaN | 5.106003e-01 | 0.03 (-0.06; 0.11) | 1.03 (0.95; 1.12) |
| 26 | OLS | Adjusted | overall | sbp | sex | 0.0 | 595.0 | 40.0 | 554.0 | 1.944960 | 1.259648 | NaN | 1.231116e-01 | 1.94 (-0.52; 4.41) | 6.99 (0.59; 82.58) |
| 27 | KW | Adjusted | overall | sbp | PPARA | 0.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 0.729107 | 6.945068e-01 | NaN | NaN |
| 28 | OLS | Adjusted | overall | ldl_cholesterol | Intercept | 0.0 | 595.0 | 40.0 | 554.0 | 4.962306 | 0.183446 | NaN | 1.792348e-105 | 4.96 (4.60; 5.32) | 142.92 (99.76; 204.76) |
| 29 | OLS | Adjusted | overall | ldl_cholesterol | PPARA | 0.0 | 595.0 | 40.0 | 554.0 | 0.165743 | 0.174374 | NaN | 3.422464e-01 | 0.17 (-0.18; 0.51) | 1.18 (0.84; 1.66) |
| 30 | OLS | Adjusted | overall | ldl_cholesterol | age | 0.0 | 595.0 | 40.0 | 554.0 | 0.002927 | 0.003095 | NaN | 3.446396e-01 | 0.00 (-0.00; 0.01) | 1.00 (1.00; 1.01) |
| 31 | OLS | Adjusted | overall | ldl_cholesterol | sex | 0.0 | 595.0 | 40.0 | 554.0 | -0.006788 | 0.091828 | NaN | 9.410981e-01 | -0.01 (-0.19; 0.17) | 0.99 (0.83; 1.19) |
| 32 | AOV | Adjusted | overall | ldl_cholesterol | PPARA | 0.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 0.830076 | 4.365229e-01 | NaN | NaN |
| 33 | OLS | Adjusted | overall | crp_level | Intercept | 0.0 | 595.0 | 40.0 | 554.0 | 40.210821 | 3.296347 | NaN | 1.099366e-30 | 40.21 (33.75; 46.67) | 290628165776811520.00 (454446183387761.50; 185... |
| 34 | OLS | Adjusted | overall | crp_level | PPARA | 0.0 | 595.0 | 40.0 | 554.0 | -0.800260 | 3.133328 | NaN | 7.985010e-01 | -0.80 (-6.94; 5.34) | 0.45 (0.00; 208.71) |
| 35 | OLS | Adjusted | overall | crp_level | age | 0.0 | 595.0 | 40.0 | 554.0 | -0.031217 | 0.055617 | NaN | 5.748179e-01 | -0.03 (-0.14; 0.08) | 0.97 (0.87; 1.08) |
| 36 | OLS | Adjusted | overall | crp_level | sex | 0.0 | 595.0 | 40.0 | 554.0 | -1.339086 | 1.650052 | NaN | 4.173813e-01 | -1.34 (-4.57; 1.89) | 0.26 (0.01; 6.65) |
| 37 | KW | Adjusted | overall | crp_level | PPARA | 0.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 3.398355 | 1.828339e-01 | NaN | NaN |
| 38 | AOV | Adjusted | overall | crp_level | PPARA | 0.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 0.521440 | 5.939377e-01 | NaN | NaN |
| 39 | CHISQ | Adjusted | overall | disease_severity | PPARA | NaN | NaN | NaN | NaN | NaN | NaN | 1.349363 | 8.529465e-01 | NaN | NaN |
| 40 | CHISQ | Adjusted | overall | treatment_response | PPARA | NaN | NaN | NaN | NaN | NaN | NaN | 5.692997 | 2.232786e-01 | NaN | NaN |
| 41 | CHISQ | Adjusted | overall | risk_category | PPARA | NaN | NaN | NaN | NaN | NaN | NaN | 2.151600 | 9.052412e-01 | NaN | NaN |
| 42 | GLM-Binom | Adjusted | overall | hypertension | Intercept | 167.0 | 595.0 | 40.0 | 554.0 | -0.658206 | 0.362441 | NaN | 6.936485e-02 | -0.66 (-1.37; 0.05) | 0.52 (0.25; 1.05) |
| 43 | GLM-Binom | Adjusted | overall | hypertension | PPARA | 167.0 | 595.0 | 40.0 | 554.0 | 0.349150 | 0.327753 | NaN | 2.867482e-01 | 0.35 (-0.29; 0.99) | 1.42 (0.75; 2.70) |
| 44 | GLM-Binom | Adjusted | overall | hypertension | age | 167.0 | 595.0 | 40.0 | 554.0 | -0.004817 | 0.006168 | NaN | 4.348695e-01 | -0.00 (-0.02; 0.01) | 1.00 (0.98; 1.01) |
| 45 | GLM-Binom | Adjusted | overall | hypertension | sex | 167.0 | 595.0 | 40.0 | 554.0 | -0.085265 | 0.183228 | NaN | 6.416821e-01 | -0.09 (-0.44; 0.27) | 0.92 (0.64; 1.32) |
| 46 | CHISQ | Adjusted | overall | hypertension | PPARA | 167.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 2.262074 | 3.226984e-01 | NaN | NaN |
| 47 | GLM-Binom | Adjusted | overall | diabetes | Intercept | 90.0 | 595.0 | 40.0 | 554.0 | -1.810679 | 0.463750 | NaN | 9.444704e-05 | -1.81 (-2.72; -0.90) | 0.16 (0.07; 0.41) |
| 48 | GLM-Binom | Adjusted | overall | diabetes | PPARA | 90.0 | 595.0 | 40.0 | 554.0 | -0.531163 | 0.530278 | NaN | 3.165039e-01 | -0.53 (-1.57; 0.51) | 0.59 (0.21; 1.66) |
| 49 | GLM-Binom | Adjusted | overall | diabetes | age | 90.0 | 595.0 | 40.0 | 554.0 | 0.001017 | 0.007785 | NaN | 8.960965e-01 | 0.00 (-0.01; 0.02) | 1.00 (0.99; 1.02) |
| 50 | GLM-Binom | Adjusted | overall | diabetes | sex | 90.0 | 595.0 | 40.0 | 554.0 | 0.110459 | 0.230917 | NaN | 6.324014e-01 | 0.11 (-0.34; 0.56) | 1.12 (0.71; 1.76) |
| 51 | FISHER | Adjusted | overall | diabetes | PPARA | 90.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 0.109807 | 5.709000e-01 | NaN | NaN |
| 52 | GLM-Binom | Adjusted | overall | cvd | Intercept | 272.0 | 595.0 | 40.0 | 554.0 | -0.242553 | 0.330850 | NaN | 4.634856e-01 | -0.24 (-0.89; 0.41) | 0.78 (0.41; 1.50) |
| 53 | GLM-Binom | Adjusted | overall | cvd | PPARA | 272.0 | 595.0 | 40.0 | 554.0 | -0.116603 | 0.316153 | NaN | 7.122634e-01 | -0.12 (-0.74; 0.50) | 0.89 (0.48; 1.65) |
| 54 | GLM-Binom | Adjusted | overall | cvd | age | 272.0 | 595.0 | 40.0 | 554.0 | -0.000826 | 0.005581 | NaN | 8.823172e-01 | -0.00 (-0.01; 0.01) | 1.00 (0.99; 1.01) |
| 55 | GLM-Binom | Adjusted | overall | cvd | sex | 272.0 | 595.0 | 40.0 | 554.0 | 0.227868 | 0.165660 | NaN | 1.689712e-01 | 0.23 (-0.10; 0.55) | 1.26 (0.91; 1.74) |
| 56 | CHISQ | Adjusted | overall | cvd | PPARA | 272.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 1.735910 | 4.198092e-01 | NaN | NaN |
| 57 | FISHER | Adjusted | overall | cvd | PPARA | 272.0 | 595.0 | 40.0 | 554.0 | NaN | NaN | 0.045933 | 4.175000e-01 | NaN | NaN |
Stratified pipeline run
The association pipeline can be run separately for each defined subgroup (stratum). This is configured via the stratify parameter in create_config, which maps stratum names to filter strings in column=value format.
By default (stratify_overall=True), an overall (unstratified) run is also performed alongside each stratum.
Stratification columns must be present in phenotypes.tsv. Here we stratify by hypertension (0.0 = no hypertension, 1.0 = hypertension), a binary outcome already in the phenotype file. The pipeline compares values via .astype(str), so filter strings use '0.0' and '1.0'.
[ ]:
# Use a fresh temporary directory for the stratified run so its output is
# kept apart from the unstratified run
tmp_strat = tempfile.mkdtemp(prefix='marvel_strat_')
config_strat = os.path.join(tmp_strat, 'example_strat.cnf')

# Stratification uses columns from phenotypes.tsv.
# 'hypertension' is a binary outcome (0.0 / 1.0 float64) in phenotypes.tsv.
# The pipeline compares via .astype(str), so filter strings use '0.0' and '1.0'.
create_config(
    path=config_strat,
    extract_variants=True,
    geno_input=geno_input,
    variant_input={'variants': os.path.join(input_dir, 'gene_variants.tsv')},
    variant_output={CFLhs.VarOutput: os.path.join(tmp_strat, 'variant_carriers')},
    association_analysis=True,
    pheno_input={CFLhs.PhenoFile: pheno_file, CFLhs.CovFile: cov_file},
    con_tests=con_tests,
    cat_tests=cat_tests,
    bin_tests=bin_tests,
    covs={'Unadjusted': 'None', 'Adjusted': ['age', 'sex']},
    # Stratum name -> filter string in column=value format
    stratify={
        'hypertension_0': 'hypertension=0.0',
        'hypertension_1': 'hypertension=1.0',
    },
    id_column='id',
    raise_on_error=True,
    ref_column='a1',
    alt_column='a2',
    cat_column='gene',
    incl_var=False,
    output_path=tmp_strat,
)

# Suppress INFO logging to keep committed cell output deterministic.
# The finally clause re-enables logging even if the pipeline run fails.
logging.disable(logging.INFO)
try:
    strat_pipeline = MARVELousPipeline.from_config_file(config_strat, verbose=False)
    strat_pipeline.run()
finally:
    logging.disable(logging.NOTSET)

# Verify output: list all relative paths under tmp_strat
all_paths = sorted(
    os.path.relpath(p, tmp_strat)
    for p in glob.glob(os.path.join(tmp_strat, '**'), recursive=True)
    if os.path.isfile(p)
)

# Display output structure (relative paths only — no temp dir prefix)
print("Stratified run output files:")
for p in all_paths:
    print("  ", p)

# Assert both strata produced output
assert any('hypertension_0' in p for p in all_paths), "hypertension_0 stratum output not found"
assert any('hypertension_1' in p for p in all_paths), "hypertension_1 stratum output not found"
print("\nAssertions passed: hypertension_0 and hypertension_1 strata both produced output.")
Cleanup
Remove temporary files created during this example.
[9]:
# Remove the temporary output directories created by this example.
# ignore_errors=True keeps the cell safe to re-run once the directories are gone.
# Comment out these lines to keep the files for inspection.
shutil.rmtree(output_dir, ignore_errors=True)
shutil.rmtree(tmp_strat, ignore_errors=True)