Configuration file

MARVELous uses a configuration file to specify all input paths, options, and analysis settings. This notebook illustrates the tools available to create and parse it.

The configuration file is a flat text file with section headers enclosed in square brackets, e.g. [GenoInput]. Each line beneath a header defines one option as a left-hand side (LHS) key and a right-hand side (RHS) value. Unrecognised options are ignored.

Create and parse a configuration file

To create a configuration file, use the provided create_config function — it validates inputs before writing. When the file is used to run the pipeline, it is parsed by ConfigParser, which performs additional checks and returns a configuration object that MARVELousPipeline consumes directly.

[ ]:
import os
import re
import glob
import shutil
import tempfile
import pandas as pd
from marvel.utils.config_tools import create_config, ConfigParser, check_config_file
from marvel.pipeline import MARVELousPipeline
from marvel.example_data import examples
from marvel.constants import (
    CONFIG_HEADERS as CFNames,
    CONFIG_LHS as CFLhs,
)

# Trigger generation and get the directory containing all example data files
input_dir = os.path.dirname(examples.get_path('vcf_example', 'example_genotypes_chr1.vcf.gz'))

Setup: Define input files and tests

[2]:
# Extract variants from vcf-files
geno_files = glob.glob(os.path.join(input_dir, 'example*vcf.gz'))
geno_input = {
    re.search(r'chr\d+', geno_files[i], re.IGNORECASE).group(0):
    geno_files[i] for i in range(len(geno_files))
}

# Define phenotype and covariates file
pheno_file = os.path.join(input_dir, 'phenotypes.tsv')
cov_file = os.path.join(input_dir, 'covariates.tsv')

# Tests to be performed per outcome
con_tests = {
    'sbp'             : ['OLS', 'KW'],
    'ldl_cholesterol' : ['OLS', 'AOV'],
    'crp_level'       : ['OLS', 'KW', 'AOV'],
}
cat_tests = {
    'disease_severity'   : ['CHISQ'],
    'treatment_response' : ['CHISQ'],
    'risk_category'      : ['CHISQ'],
}
bin_tests = {
    'hypertension' : ['GLM-Binom', 'CHISQ'],
    'diabetes'     : ['GLM-Binom', 'FISHER'],
    'cvd'          : ['GLM-Binom', 'CHISQ', 'FISHER'],
}

# Create a temporary output directory for this example
output_dir = tempfile.mkdtemp(prefix='marvel_example_')

Create configuration file

We’ll create a configuration file with all the necessary options for variant extraction and association testing.

[3]:
# Create configuration file
config_file = os.path.join(output_dir, 'example.cnf')

create_config(
    path = config_file,
    extract_variants = True,
    geno_input = geno_input,
    variant_input = {'variants' : os.path.join(input_dir, 'gene_variants.tsv')},
    variant_output = {CFLhs.VarOutput : os.path.join(output_dir, 'variant_carriers')},
    association_analysis = True,
    pheno_input = {CFLhs.PhenoFile : pheno_file, CFLhs.CovFile : cov_file},
    con_tests = con_tests,
    cat_tests = cat_tests,
    bin_tests = bin_tests,
    covs = {'Unadjusted' : 'None', 'Adjusted' : ['age', 'sex']},
    id_column = 'id',
    raise_on_error = True,
    ref_column = 'a1',
    alt_column = 'a2',
    cat_column = 'gene',
    incl_var = False,
    output_path = output_dir,
)

Parse and validate configuration file

The configuration file is parsed into a dictionary and validated to ensure all required files exist and options are valid.

[4]:
# Parse the configuration file
config = ConfigParser(config_file)

# Validate the configuration
check_config_file(config)

# Display the configuration
config
[4]:
{'GenoInput': {'chr22': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr22.vcf.gz',
  'chr16': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr16.vcf.gz',
  'chr1': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr1.vcf.gz',
  'chr7': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr7.vcf.gz',
  'chr5': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr5.vcf.gz'},
 'VarInput': {'variants': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/gene_variants.tsv'},
 'Output': {'VarOutput': '/var/folders/pn/y5_78t7d3zsbvv25wz7s5xsr0000gn/T/marvel_example_n49pr04v/variant_carriers'},
 'ExpInput': {'VarOutput': '/var/folders/pn/y5_78t7d3zsbvv25wz7s5xsr0000gn/T/marvel_example_n49pr04v/variant_carriers'},
 'PhenoInput': {'phenotypes': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/phenotypes.tsv',
  'covariates': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/covariates.tsv'},
 'ConTests': {'sbp': 'OLS;KW',
  'ldl_cholesterol': 'OLS;AOV',
  'crp_level': 'OLS;KW;AOV'},
 'CatTests': {'disease_severity': 'CHISQ',
  'treatment_response': 'CHISQ',
  'risk_category': 'CHISQ'},
 'BinTests': {'hypertension': 'GLM-Binom;CHISQ',
  'diabetes': 'GLM-Binom;FISHER',
  'cvd': 'GLM-Binom;CHISQ;FISHER'},
 'SurvTests': {},
 'Covs': {'Unadjusted': None, 'Adjusted': 'age;sex'},
 'Stratification': {},
 'Options': {'id_column': 'id',
  'raise_on_error': True,
  'ref_column': 'a1',
  'alt_column': 'a2',
  'cat_column': 'gene',
  'incl_var': False,
  'output_path': '/var/folders/pn/y5_78t7d3zsbvv25wz7s5xsr0000gn/T/marvel_example_n49pr04v',
  'extract_variants': True,
  'association_analysis': True}}

Run the complete pipeline

The MARVELousPipeline class provides a complete pipeline for variant extraction and association testing. It will:

  1. Extract variants from VCF files

  2. Aggregate variants to genes (if cat_column is specified)

  3. Perform association tests for all specified outcomes and models

[ ]:
# Create and run the pipeline
pipeline = MARVELousPipeline.from_config_file(config_file, verbose=True)
results = pipeline.run() # prints a lot of warnings

# print the summary
results

Examine the results

Let’s look at the extracted gene carriers and association testing results.

[6]:
# Load the gene carriers file
carriers_file = os.path.join(output_dir, 'variant_carriers_carriers.tsv.gz')
carriers = pd.read_csv(carriers_file, sep='\t')
carriers
[6]:
id PCSK9 ANGPTL3 HMGCR NPC1L1 CETP PPARA
0 ID_000000 0.0 0.0 0.0 0.0 0.0 0.0
1 ID_000001 0.0 0.0 0.0 0.0 0.0 0.0
2 ID_000002 0.0 0.0 0.0 0.0 0.0 0.0
3 ID_000003 0.0 0.0 0.0 0.0 0.0 0.0
4 ID_000004 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ...
995 ID_000995 0.0 1.0 0.0 0.0 0.0 0.0
996 ID_000996 0.0 0.0 0.0 0.0 0.0 0.0
997 ID_000997 0.0 0.0 0.0 0.0 0.0 0.0
998 ID_000998 0.0 0.0 0.0 0.0 0.0 0.0
999 ID_000999 0.0 0.0 0.0 0.0 0.0 0.0

1000 rows × 7 columns

[7]:
# Find association results files
results_files = glob.glob(os.path.join(output_dir, '*_results.tsv.gz'))
example_results = pd.read_csv(results_files[0], sep='\t')
example_results
[7]:
Model Model name Stratification name Variable Exposure Cases Samples Exposed Non-exposed Point Estimates Standard Error Test statistic P-values Formatted Formatted (exp)
0 OLS Unadjusted overall sbp Intercept 0.0 595.0 40.0 554.0 120.302849 0.650156 NaN 0.000000e+00 120.30 (119.03; 121.58) 1765482086878253772055506936203298189383022575...
1 OLS Unadjusted overall sbp PPARA 0.0 595.0 40.0 554.0 -1.017021 2.390836 NaN 6.707119e-01 -1.02 (-5.70; 3.67) 0.36 (0.00; 39.21)
2 KW Unadjusted overall sbp PPARA 0.0 595.0 40.0 554.0 NaN NaN 0.729107 6.945068e-01 NaN NaN
3 OLS Unadjusted overall ldl_cholesterol Intercept 0.0 595.0 40.0 554.0 5.120017 0.047318 NaN 0.000000e+00 5.12 (5.03; 5.21) 167.34 (152.52; 183.60)
4 OLS Unadjusted overall ldl_cholesterol PPARA 0.0 595.0 40.0 554.0 0.157997 0.174005 NaN 3.642451e-01 0.16 (-0.18; 0.50) 1.17 (0.83; 1.65)
5 AOV Unadjusted overall ldl_cholesterol PPARA 0.0 595.0 40.0 554.0 NaN NaN 0.830076 4.365229e-01 NaN NaN
6 OLS Unadjusted overall crp_level Intercept 0.0 595.0 40.0 554.0 37.760859 0.850326 NaN 9.706392e-191 37.76 (36.09; 39.43) 25080298728020192.00 (4737333186671159.00; 132...
7 OLS Unadjusted overall crp_level PPARA 0.0 595.0 40.0 554.0 -0.690148 3.126926 NaN 8.253932e-01 -0.69 (-6.82; 5.44) 0.50 (0.00; 230.10)
8 KW Unadjusted overall crp_level PPARA 0.0 595.0 40.0 554.0 NaN NaN 3.398355 1.828339e-01 NaN NaN
9 AOV Unadjusted overall crp_level PPARA 0.0 595.0 40.0 554.0 NaN NaN 0.521440 5.939377e-01 NaN NaN
10 CHISQ Unadjusted overall disease_severity PPARA NaN NaN NaN NaN NaN NaN 1.349363 8.529465e-01 NaN NaN
11 CHISQ Unadjusted overall treatment_response PPARA NaN NaN NaN NaN NaN NaN 5.692997 2.232786e-01 NaN NaN
12 CHISQ Unadjusted overall risk_category PPARA NaN NaN NaN NaN NaN NaN 2.151600 9.052412e-01 NaN NaN
13 GLM-Binom Unadjusted overall hypertension Intercept 167.0 595.0 40.0 554.0 -0.968590 0.095037 NaN 2.159747e-24 -0.97 (-1.15; -0.78) 0.38 (0.32; 0.46)
14 GLM-Binom Unadjusted overall hypertension PPARA 167.0 595.0 40.0 554.0 0.362809 0.327072 NaN 2.673155e-01 0.36 (-0.28; 1.00) 1.44 (0.76; 2.73)
15 CHISQ Unadjusted overall hypertension PPARA 167.0 595.0 40.0 554.0 NaN NaN 2.262074 3.226984e-01 NaN NaN
16 GLM-Binom Unadjusted overall diabetes Intercept 90.0 595.0 40.0 554.0 -1.693308 0.117248 NaN 2.810478e-47 -1.69 (-1.92; -1.46) 0.18 (0.15; 0.23)
17 GLM-Binom Unadjusted overall diabetes PPARA 90.0 595.0 40.0 554.0 -0.537188 0.530297 NaN 3.110627e-01 -0.54 (-1.58; 0.50) 0.58 (0.21; 1.65)
18 FISHER Unadjusted overall diabetes PPARA 90.0 595.0 40.0 554.0 NaN NaN 0.109807 5.690000e-01 NaN NaN
19 GLM-Binom Unadjusted overall cvd Intercept 272.0 595.0 40.0 554.0 -0.163532 0.085188 NaN 5.490186e-02 -0.16 (-0.33; 0.00) 0.85 (0.72; 1.00)
20 GLM-Binom Unadjusted overall cvd PPARA 272.0 595.0 40.0 554.0 -0.118555 0.315469 NaN 7.070608e-01 -0.12 (-0.74; 0.50) 0.89 (0.48; 1.65)
21 CHISQ Unadjusted overall cvd PPARA 272.0 595.0 40.0 554.0 NaN NaN 1.735910 4.198092e-01 NaN NaN
22 FISHER Unadjusted overall cvd PPARA 272.0 595.0 40.0 554.0 NaN NaN 0.045933 4.146000e-01 NaN NaN
23 OLS Adjusted overall sbp Intercept 0.0 595.0 40.0 554.0 117.703262 2.516429 NaN 8.164671e-201 117.70 (112.77; 122.64) 1311828317954960423126233004197845153998444657...
24 OLS Adjusted overall sbp PPARA 0.0 595.0 40.0 554.0 -0.903887 2.391981 NaN 7.056537e-01 -0.90 (-5.59; 3.78) 0.40 (0.00; 44.01)
25 OLS Adjusted overall sbp age 0.0 595.0 40.0 554.0 0.027950 0.042458 NaN 5.106003e-01 0.03 (-0.06; 0.11) 1.03 (0.95; 1.12)
26 OLS Adjusted overall sbp sex 0.0 595.0 40.0 554.0 1.944960 1.259648 NaN 1.231116e-01 1.94 (-0.52; 4.41) 6.99 (0.59; 82.58)
27 KW Adjusted overall sbp PPARA 0.0 595.0 40.0 554.0 NaN NaN 0.729107 6.945068e-01 NaN NaN
28 OLS Adjusted overall ldl_cholesterol Intercept 0.0 595.0 40.0 554.0 4.962306 0.183446 NaN 1.792348e-105 4.96 (4.60; 5.32) 142.92 (99.76; 204.76)
29 OLS Adjusted overall ldl_cholesterol PPARA 0.0 595.0 40.0 554.0 0.165743 0.174374 NaN 3.422464e-01 0.17 (-0.18; 0.51) 1.18 (0.84; 1.66)
30 OLS Adjusted overall ldl_cholesterol age 0.0 595.0 40.0 554.0 0.002927 0.003095 NaN 3.446396e-01 0.00 (-0.00; 0.01) 1.00 (1.00; 1.01)
31 OLS Adjusted overall ldl_cholesterol sex 0.0 595.0 40.0 554.0 -0.006788 0.091828 NaN 9.410981e-01 -0.01 (-0.19; 0.17) 0.99 (0.83; 1.19)
32 AOV Adjusted overall ldl_cholesterol PPARA 0.0 595.0 40.0 554.0 NaN NaN 0.830076 4.365229e-01 NaN NaN
33 OLS Adjusted overall crp_level Intercept 0.0 595.0 40.0 554.0 40.210821 3.296347 NaN 1.099366e-30 40.21 (33.75; 46.67) 290628165776811520.00 (454446183387761.50; 185...
34 OLS Adjusted overall crp_level PPARA 0.0 595.0 40.0 554.0 -0.800260 3.133328 NaN 7.985010e-01 -0.80 (-6.94; 5.34) 0.45 (0.00; 208.71)
35 OLS Adjusted overall crp_level age 0.0 595.0 40.0 554.0 -0.031217 0.055617 NaN 5.748179e-01 -0.03 (-0.14; 0.08) 0.97 (0.87; 1.08)
36 OLS Adjusted overall crp_level sex 0.0 595.0 40.0 554.0 -1.339086 1.650052 NaN 4.173813e-01 -1.34 (-4.57; 1.89) 0.26 (0.01; 6.65)
37 KW Adjusted overall crp_level PPARA 0.0 595.0 40.0 554.0 NaN NaN 3.398355 1.828339e-01 NaN NaN
38 AOV Adjusted overall crp_level PPARA 0.0 595.0 40.0 554.0 NaN NaN 0.521440 5.939377e-01 NaN NaN
39 CHISQ Adjusted overall disease_severity PPARA NaN NaN NaN NaN NaN NaN 1.349363 8.529465e-01 NaN NaN
40 CHISQ Adjusted overall treatment_response PPARA NaN NaN NaN NaN NaN NaN 5.692997 2.232786e-01 NaN NaN
41 CHISQ Adjusted overall risk_category PPARA NaN NaN NaN NaN NaN NaN 2.151600 9.052412e-01 NaN NaN
42 GLM-Binom Adjusted overall hypertension Intercept 167.0 595.0 40.0 554.0 -0.658206 0.362441 NaN 6.936485e-02 -0.66 (-1.37; 0.05) 0.52 (0.25; 1.05)
43 GLM-Binom Adjusted overall hypertension PPARA 167.0 595.0 40.0 554.0 0.349150 0.327753 NaN 2.867482e-01 0.35 (-0.29; 0.99) 1.42 (0.75; 2.70)
44 GLM-Binom Adjusted overall hypertension age 167.0 595.0 40.0 554.0 -0.004817 0.006168 NaN 4.348695e-01 -0.00 (-0.02; 0.01) 1.00 (0.98; 1.01)
45 GLM-Binom Adjusted overall hypertension sex 167.0 595.0 40.0 554.0 -0.085265 0.183228 NaN 6.416821e-01 -0.09 (-0.44; 0.27) 0.92 (0.64; 1.32)
46 CHISQ Adjusted overall hypertension PPARA 167.0 595.0 40.0 554.0 NaN NaN 2.262074 3.226984e-01 NaN NaN
47 GLM-Binom Adjusted overall diabetes Intercept 90.0 595.0 40.0 554.0 -1.810679 0.463750 NaN 9.444704e-05 -1.81 (-2.72; -0.90) 0.16 (0.07; 0.41)
48 GLM-Binom Adjusted overall diabetes PPARA 90.0 595.0 40.0 554.0 -0.531163 0.530278 NaN 3.165039e-01 -0.53 (-1.57; 0.51) 0.59 (0.21; 1.66)
49 GLM-Binom Adjusted overall diabetes age 90.0 595.0 40.0 554.0 0.001017 0.007785 NaN 8.960965e-01 0.00 (-0.01; 0.02) 1.00 (0.99; 1.02)
50 GLM-Binom Adjusted overall diabetes sex 90.0 595.0 40.0 554.0 0.110459 0.230917 NaN 6.324014e-01 0.11 (-0.34; 0.56) 1.12 (0.71; 1.76)
51 FISHER Adjusted overall diabetes PPARA 90.0 595.0 40.0 554.0 NaN NaN 0.109807 5.709000e-01 NaN NaN
52 GLM-Binom Adjusted overall cvd Intercept 272.0 595.0 40.0 554.0 -0.242553 0.330850 NaN 4.634856e-01 -0.24 (-0.89; 0.41) 0.78 (0.41; 1.50)
53 GLM-Binom Adjusted overall cvd PPARA 272.0 595.0 40.0 554.0 -0.116603 0.316153 NaN 7.122634e-01 -0.12 (-0.74; 0.50) 0.89 (0.48; 1.65)
54 GLM-Binom Adjusted overall cvd age 272.0 595.0 40.0 554.0 -0.000826 0.005581 NaN 8.823172e-01 -0.00 (-0.01; 0.01) 1.00 (0.99; 1.01)
55 GLM-Binom Adjusted overall cvd sex 272.0 595.0 40.0 554.0 0.227868 0.165660 NaN 1.689712e-01 0.23 (-0.10; 0.55) 1.26 (0.91; 1.74)
56 CHISQ Adjusted overall cvd PPARA 272.0 595.0 40.0 554.0 NaN NaN 1.735910 4.198092e-01 NaN NaN
57 FISHER Adjusted overall cvd PPARA 272.0 595.0 40.0 554.0 NaN NaN 0.045933 4.175000e-01 NaN NaN

Stratified pipeline run

The association pipeline can be run separately for each defined subgroup (stratum). This is configured via the stratify parameter in create_config, which maps stratum names to filter strings in column=value format.

By default (stratify_overall=True), an overall (unstratified) run is also performed alongside each stratum.

Stratification columns must be present in phenotypes.tsv. Here we stratify by hypertension (0.0 = no hypertension, 1.0 = hypertension), a binary outcome already in the phenotype file. The pipeline compares values via .astype(str), so filter strings use '0.0' and '1.0'.

[ ]:
import logging

# Use a fresh temporary directory for the stratified run
tmp_strat = tempfile.mkdtemp(prefix='marvel_strat_')
config_strat = os.path.join(tmp_strat, 'example_strat.cnf')

# Stratification uses columns from phenotypes.tsv.
# 'hypertension' is a binary outcome (0.0 / 1.0 float64) in phenotypes.tsv.
# The pipeline compares via .astype(str), so filter strings use '0.0' and '1.0'.
create_config(
    path=config_strat,
    extract_variants=True,
    geno_input=geno_input,
    variant_input={'variants': os.path.join(input_dir, 'gene_variants.tsv')},
    variant_output={CFLhs.VarOutput: os.path.join(tmp_strat, 'variant_carriers')},
    association_analysis=True,
    pheno_input={CFLhs.PhenoFile: pheno_file, CFLhs.CovFile: cov_file},
    con_tests=con_tests,
    cat_tests=cat_tests,
    bin_tests=bin_tests,
    covs={'Unadjusted': 'None', 'Adjusted': ['age', 'sex']},
    stratify={
        'hypertension_0': 'hypertension=0.0',
        'hypertension_1': 'hypertension=1.0',
    },
    id_column='id',
    raise_on_error=True,
    ref_column='a1',
    alt_column='a2',
    cat_column='gene',
    incl_var=False,
    output_path=tmp_strat,
)

# Suppress INFO logging to keep committed cell output deterministic
logging.disable(logging.INFO)
try:
    strat_pipeline = MARVELousPipeline.from_config_file(config_strat, verbose=False)
    strat_pipeline.run()
finally:
    logging.disable(logging.NOTSET)

# Verify output: list all relative paths under tmp_strat
all_paths = sorted(
    os.path.relpath(p, tmp_strat)
    for p in glob.glob(os.path.join(tmp_strat, '**'), recursive=True)
    if os.path.isfile(p)
)

# Display output structure (relative paths only — no temp dir prefix)
print("Stratified run output files:")
for p in all_paths:
    print(" ", p)

# Assert both strata produced output
assert any('hypertension_0' in p for p in all_paths), "hypertension_0 stratum output not found"
assert any('hypertension_1' in p for p in all_paths), "hypertension_1 stratum output not found"
print("\nAssertions passed: hypertension_0 and hypertension_1 strata both produced output.")

Cleanup

Remove temporary files created during this example.

[9]:
# Uncomment the following lines to remove the temporary output directories
shutil.rmtree(output_dir)
shutil.rmtree(tmp_strat)