Configuration file

MARVELous uses a configuration file to specify all input paths, options, and analysis settings. This notebook illustrates the tools available to create and parse it.

The configuration file is a flat text file with section headers enclosed in square brackets, e.g. [GenoInput]. Each line beneath a header defines one option as a left-hand side (LHS) key and a right-hand side (RHS) value. Unrecognised options are ignored.

Create and parse a configuration file

To create a configuration file, use the provided create_config function — it validates inputs before writing. When the file is used to run the pipeline, it is parsed by ConfigParser, which performs additional checks and returns a configuration object that MARVELousPipeline consumes directly.

[ ]:

import os
import re
import glob
import shutil
import tempfile
import pandas as pd
from marvel.utils.config_tools import create_config, ConfigParser, check_config_file
from marvel.pipeline import MARVELousPipeline
from marvel.example_data import examples
from marvel.constants import (
    CONFIG_HEADERS as CFNames,
    CONFIG_LHS as CFLhs,
)

# Trigger generation and get the directory containing all example data files
input_dir = os.path.dirname(examples.get_path('vcf_example', 'example_genotypes_chr1.vcf.gz'))

Setup: Define input files and tests

[2]:

# Extract variants from vcf-files
geno_files = glob.glob(os.path.join(input_dir, 'example*vcf.gz'))
geno_input = {
    re.search(r'chr\d+', geno_files[i], re.IGNORECASE).group(0):
    geno_files[i] for i in range(len(geno_files))
}

# Define phenotype and covariates file
pheno_file = os.path.join(input_dir, 'phenotypes.tsv')
cov_file = os.path.join(input_dir, 'covariates.tsv')

# Tests to be performed per outcome
con_tests = {
    'sbp'             : ['OLS', 'KW'],
    'ldl_cholesterol' : ['OLS', 'AOV'],
    'crp_level'       : ['OLS', 'KW', 'AOV'],
}
cat_tests = {
    'disease_severity'   : ['CHISQ'],
    'treatment_response' : ['CHISQ'],
    'risk_category'      : ['CHISQ'],
}
bin_tests = {
    'hypertension' : ['GLM-Binom', 'CHISQ'],
    'diabetes'     : ['GLM-Binom', 'FISHER'],
    'cvd'          : ['GLM-Binom', 'CHISQ', 'FISHER'],
}

# Create a temporary output directory for this example
output_dir = tempfile.mkdtemp(prefix='marvel_example_')

Create configuration file

We’ll create a configuration file with all the necessary options for variant extraction and association testing.

[3]:

# Create configuration file
config_file = os.path.join(output_dir, 'example.cnf')

create_config(
    path = config_file,
    extract_variants = True,
    geno_input = geno_input,
    variant_input = {'variants' : os.path.join(input_dir, 'gene_variants.tsv')},
    variant_output = {CFLhs.VarOutput : os.path.join(output_dir, 'variant_carriers')},
    association_analysis = True,
    pheno_input = {CFLhs.PhenoFile : pheno_file, CFLhs.CovFile : cov_file},
    con_tests = con_tests,
    cat_tests = cat_tests,
    bin_tests = bin_tests,
    covs = {'Unadjusted' : 'None', 'Adjusted' : ['age', 'sex']},
    id_column = 'id',
    raise_on_error = True,
    ref_column = 'a1',
    alt_column = 'a2',
    cat_column = 'gene',
    incl_var = False,
    output_path = output_dir,
)

Parse and validate configuration file

The configuration file is parsed into a dictionary and validated to ensure all required files exist and options are valid.

[4]:

# Parse the configuration file
config = ConfigParser(config_file)

# Validate the configuration
check_config_file(config)

# Display the configuration
config

[4]:

{'GenoInput': {'chr22': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr22.vcf.gz',
  'chr16': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr16.vcf.gz',
  'chr1': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr1.vcf.gz',
  'chr7': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr7.vcf.gz',
  'chr5': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/example_genotypes_chr5.vcf.gz'},
 'VarInput': {'variants': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/gene_variants.tsv'},
 'Output': {'VarOutput': '/var/folders/pn/y5_78t7d3zsbvv25wz7s5xsr0000gn/T/marvel_example_n49pr04v/variant_carriers'},
 'ExpInput': {'VarOutput': '/var/folders/pn/y5_78t7d3zsbvv25wz7s5xsr0000gn/T/marvel_example_n49pr04v/variant_carriers'},
 'PhenoInput': {'phenotypes': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/phenotypes.tsv',
  'covariates': '/Users/marionvanvugt/Documents/work.nosync/projects/marvel/marvel/example_data/example_datasets/covariates.tsv'},
 'ConTests': {'sbp': 'OLS;KW',
  'ldl_cholesterol': 'OLS;AOV',
  'crp_level': 'OLS;KW;AOV'},
 'CatTests': {'disease_severity': 'CHISQ',
  'treatment_response': 'CHISQ',
  'risk_category': 'CHISQ'},
 'BinTests': {'hypertension': 'GLM-Binom;CHISQ',
  'diabetes': 'GLM-Binom;FISHER',
  'cvd': 'GLM-Binom;CHISQ;FISHER'},
 'SurvTests': {},
 'Covs': {'Unadjusted': None, 'Adjusted': 'age;sex'},
 'Stratification': {},
 'Options': {'id_column': 'id',
  'raise_on_error': True,
  'ref_column': 'a1',
  'alt_column': 'a2',
  'cat_column': 'gene',
  'incl_var': False,
  'output_path': '/var/folders/pn/y5_78t7d3zsbvv25wz7s5xsr0000gn/T/marvel_example_n49pr04v',
  'extract_variants': True,
  'association_analysis': True}}

Run the complete pipeline

The MARVELousPipeline class provides a complete pipeline for variant extraction and association testing. It will:

Extract variants from VCF files
Aggregate variants to genes (if cat_column is specified)
Perform association tests for all specified outcomes and models

[ ]:

# Create and run the pipeline
pipeline = MARVELousPipeline.from_config_file(config_file, verbose=True)
results = pipeline.run() # prints a lot of warnings

# print the summary
results

Examine the results

Let’s look at the extracted gene carriers and association testing results.

[6]:

# Load the gene carriers file
carriers_file = os.path.join(output_dir, 'variant_carriers_carriers.tsv.gz')
carriers = pd.read_csv(carriers_file, sep='\t')
carriers

[6]:

	id	PCSK9	ANGPTL3	HMGCR	NPC1L1	CETP	PPARA
0	ID_000000	0.0	0.0	0.0	0.0	0.0	0.0
1	ID_000001	0.0	0.0	0.0	0.0	0.0	0.0
2	ID_000002	0.0	0.0	0.0	0.0	0.0	0.0
3	ID_000003	0.0	0.0	0.0	0.0	0.0	0.0
4	ID_000004	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...
995	ID_000995	0.0	1.0	0.0	0.0	0.0	0.0
996	ID_000996	0.0	0.0	0.0	0.0	0.0	0.0
997	ID_000997	0.0	0.0	0.0	0.0	0.0	0.0
998	ID_000998	0.0	0.0	0.0	0.0	0.0	0.0
999	ID_000999	0.0	0.0	0.0	0.0	0.0	0.0

1000 rows × 7 columns

[7]:

# Find association results files
results_files = glob.glob(os.path.join(output_dir, '*_results.tsv.gz'))
example_results = pd.read_csv(results_files[0], sep='\t')
example_results

[7]:

	Model	Model name	Stratification name	Variable	Exposure	Cases	Samples	Exposed	Non-exposed	Point Estimates	Standard Error	Test statistic	P-values	Formatted	Formatted (exp)
0	OLS	Unadjusted	overall	sbp	Intercept	0.0	595.0	40.0	554.0	120.302849	0.650156	NaN	0.000000e+00	120.30 (119.03; 121.58)	1765482086878253772055506936203298189383022575...
1	OLS	Unadjusted	overall	sbp	PPARA	0.0	595.0	40.0	554.0	-1.017021	2.390836	NaN	6.707119e-01	-1.02 (-5.70; 3.67)	0.36 (0.00; 39.21)
2	KW	Unadjusted	overall	sbp	PPARA	0.0	595.0	40.0	554.0	NaN	NaN	0.729107	6.945068e-01	NaN	NaN
3	OLS	Unadjusted	overall	ldl_cholesterol	Intercept	0.0	595.0	40.0	554.0	5.120017	0.047318	NaN	0.000000e+00	5.12 (5.03; 5.21)	167.34 (152.52; 183.60)
4	OLS	Unadjusted	overall	ldl_cholesterol	PPARA	0.0	595.0	40.0	554.0	0.157997	0.174005	NaN	3.642451e-01	0.16 (-0.18; 0.50)	1.17 (0.83; 1.65)
5	AOV	Unadjusted	overall	ldl_cholesterol	PPARA	0.0	595.0	40.0	554.0	NaN	NaN	0.830076	4.365229e-01	NaN	NaN
6	OLS	Unadjusted	overall	crp_level	Intercept	0.0	595.0	40.0	554.0	37.760859	0.850326	NaN	9.706392e-191	37.76 (36.09; 39.43)	25080298728020192.00 (4737333186671159.00; 132...
7	OLS	Unadjusted	overall	crp_level	PPARA	0.0	595.0	40.0	554.0	-0.690148	3.126926	NaN	8.253932e-01	-0.69 (-6.82; 5.44)	0.50 (0.00; 230.10)
8	KW	Unadjusted	overall	crp_level	PPARA	0.0	595.0	40.0	554.0	NaN	NaN	3.398355	1.828339e-01	NaN	NaN
9	AOV	Unadjusted	overall	crp_level	PPARA	0.0	595.0	40.0	554.0	NaN	NaN	0.521440	5.939377e-01	NaN	NaN
10	CHISQ	Unadjusted	overall	disease_severity	PPARA	NaN	NaN	NaN	NaN	NaN	NaN	1.349363	8.529465e-01	NaN	NaN
11	CHISQ	Unadjusted	overall	treatment_response	PPARA	NaN	NaN	NaN	NaN	NaN	NaN	5.692997	2.232786e-01	NaN	NaN
12	CHISQ	Unadjusted	overall	risk_category	PPARA	NaN	NaN	NaN	NaN	NaN	NaN	2.151600	9.052412e-01	NaN	NaN
13	GLM-Binom	Unadjusted	overall	hypertension	Intercept	167.0	595.0	40.0	554.0	-0.968590	0.095037	NaN	2.159747e-24	-0.97 (-1.15; -0.78)	0.38 (0.32; 0.46)
14	GLM-Binom	Unadjusted	overall	hypertension	PPARA	167.0	595.0	40.0	554.0	0.362809	0.327072	NaN	2.673155e-01	0.36 (-0.28; 1.00)	1.44 (0.76; 2.73)
15	CHISQ	Unadjusted	overall	hypertension	PPARA	167.0	595.0	40.0	554.0	NaN	NaN	2.262074	3.226984e-01	NaN	NaN
16	GLM-Binom	Unadjusted	overall	diabetes	Intercept	90.0	595.0	40.0	554.0	-1.693308	0.117248	NaN	2.810478e-47	-1.69 (-1.92; -1.46)	0.18 (0.15; 0.23)
17	GLM-Binom	Unadjusted	overall	diabetes	PPARA	90.0	595.0	40.0	554.0	-0.537188	0.530297	NaN	3.110627e-01	-0.54 (-1.58; 0.50)	0.58 (0.21; 1.65)
18	FISHER	Unadjusted	overall	diabetes	PPARA	90.0	595.0	40.0	554.0	NaN	NaN	0.109807	5.690000e-01	NaN	NaN
19	GLM-Binom	Unadjusted	overall	cvd	Intercept	272.0	595.0	40.0	554.0	-0.163532	0.085188	NaN	5.490186e-02	-0.16 (-0.33; 0.00)	0.85 (0.72; 1.00)
20	GLM-Binom	Unadjusted	overall	cvd	PPARA	272.0	595.0	40.0	554.0	-0.118555	0.315469	NaN	7.070608e-01	-0.12 (-0.74; 0.50)	0.89 (0.48; 1.65)
21	CHISQ	Unadjusted	overall	cvd	PPARA	272.0	595.0	40.0	554.0	NaN	NaN	1.735910	4.198092e-01	NaN	NaN
22	FISHER	Unadjusted	overall	cvd	PPARA	272.0	595.0	40.0	554.0	NaN	NaN	0.045933	4.146000e-01	NaN	NaN
23	OLS	Adjusted	overall	sbp	Intercept	0.0	595.0	40.0	554.0	117.703262	2.516429	NaN	8.164671e-201	117.70 (112.77; 122.64)	1311828317954960423126233004197845153998444657...
24	OLS	Adjusted	overall	sbp	PPARA	0.0	595.0	40.0	554.0	-0.903887	2.391981	NaN	7.056537e-01	-0.90 (-5.59; 3.78)	0.40 (0.00; 44.01)
25	OLS	Adjusted	overall	sbp	age	0.0	595.0	40.0	554.0	0.027950	0.042458	NaN	5.106003e-01	0.03 (-0.06; 0.11)	1.03 (0.95; 1.12)
26	OLS	Adjusted	overall	sbp	sex	0.0	595.0	40.0	554.0	1.944960	1.259648	NaN	1.231116e-01	1.94 (-0.52; 4.41)	6.99 (0.59; 82.58)
27	KW	Adjusted	overall	sbp	PPARA	0.0	595.0	40.0	554.0	NaN	NaN	0.729107	6.945068e-01	NaN	NaN
28	OLS	Adjusted	overall	ldl_cholesterol	Intercept	0.0	595.0	40.0	554.0	4.962306	0.183446	NaN	1.792348e-105	4.96 (4.60; 5.32)	142.92 (99.76; 204.76)
29	OLS	Adjusted	overall	ldl_cholesterol	PPARA	0.0	595.0	40.0	554.0	0.165743	0.174374	NaN	3.422464e-01	0.17 (-0.18; 0.51)	1.18 (0.84; 1.66)
30	OLS	Adjusted	overall	ldl_cholesterol	age	0.0	595.0	40.0	554.0	0.002927	0.003095	NaN	3.446396e-01	0.00 (-0.00; 0.01)	1.00 (1.00; 1.01)
31	OLS	Adjusted	overall	ldl_cholesterol	sex	0.0	595.0	40.0	554.0	-0.006788	0.091828	NaN	9.410981e-01	-0.01 (-0.19; 0.17)	0.99 (0.83; 1.19)
32	AOV	Adjusted	overall	ldl_cholesterol	PPARA	0.0	595.0	40.0	554.0	NaN	NaN	0.830076	4.365229e-01	NaN	NaN
33	OLS	Adjusted	overall	crp_level	Intercept	0.0	595.0	40.0	554.0	40.210821	3.296347	NaN	1.099366e-30	40.21 (33.75; 46.67)	290628165776811520.00 (454446183387761.50; 185...
34	OLS	Adjusted	overall	crp_level	PPARA	0.0	595.0	40.0	554.0	-0.800260	3.133328	NaN	7.985010e-01	-0.80 (-6.94; 5.34)	0.45 (0.00; 208.71)
35	OLS	Adjusted	overall	crp_level	age	0.0	595.0	40.0	554.0	-0.031217	0.055617	NaN	5.748179e-01	-0.03 (-0.14; 0.08)	0.97 (0.87; 1.08)
36	OLS	Adjusted	overall	crp_level	sex	0.0	595.0	40.0	554.0	-1.339086	1.650052	NaN	4.173813e-01	-1.34 (-4.57; 1.89)	0.26 (0.01; 6.65)
37	KW	Adjusted	overall	crp_level	PPARA	0.0	595.0	40.0	554.0	NaN	NaN	3.398355	1.828339e-01	NaN	NaN
38	AOV	Adjusted	overall	crp_level	PPARA	0.0	595.0	40.0	554.0	NaN	NaN	0.521440	5.939377e-01	NaN	NaN
39	CHISQ	Adjusted	overall	disease_severity	PPARA	NaN	NaN	NaN	NaN	NaN	NaN	1.349363	8.529465e-01	NaN	NaN
40	CHISQ	Adjusted	overall	treatment_response	PPARA	NaN	NaN	NaN	NaN	NaN	NaN	5.692997	2.232786e-01	NaN	NaN
41	CHISQ	Adjusted	overall	risk_category	PPARA	NaN	NaN	NaN	NaN	NaN	NaN	2.151600	9.052412e-01	NaN	NaN
42	GLM-Binom	Adjusted	overall	hypertension	Intercept	167.0	595.0	40.0	554.0	-0.658206	0.362441	NaN	6.936485e-02	-0.66 (-1.37; 0.05)	0.52 (0.25; 1.05)
43	GLM-Binom	Adjusted	overall	hypertension	PPARA	167.0	595.0	40.0	554.0	0.349150	0.327753	NaN	2.867482e-01	0.35 (-0.29; 0.99)	1.42 (0.75; 2.70)
44	GLM-Binom	Adjusted	overall	hypertension	age	167.0	595.0	40.0	554.0	-0.004817	0.006168	NaN	4.348695e-01	-0.00 (-0.02; 0.01)	1.00 (0.98; 1.01)
45	GLM-Binom	Adjusted	overall	hypertension	sex	167.0	595.0	40.0	554.0	-0.085265	0.183228	NaN	6.416821e-01	-0.09 (-0.44; 0.27)	0.92 (0.64; 1.32)
46	CHISQ	Adjusted	overall	hypertension	PPARA	167.0	595.0	40.0	554.0	NaN	NaN	2.262074	3.226984e-01	NaN	NaN
47	GLM-Binom	Adjusted	overall	diabetes	Intercept	90.0	595.0	40.0	554.0	-1.810679	0.463750	NaN	9.444704e-05	-1.81 (-2.72; -0.90)	0.16 (0.07; 0.41)
48	GLM-Binom	Adjusted	overall	diabetes	PPARA	90.0	595.0	40.0	554.0	-0.531163	0.530278	NaN	3.165039e-01	-0.53 (-1.57; 0.51)	0.59 (0.21; 1.66)
49	GLM-Binom	Adjusted	overall	diabetes	age	90.0	595.0	40.0	554.0	0.001017	0.007785	NaN	8.960965e-01	0.00 (-0.01; 0.02)	1.00 (0.99; 1.02)
50	GLM-Binom	Adjusted	overall	diabetes	sex	90.0	595.0	40.0	554.0	0.110459	0.230917	NaN	6.324014e-01	0.11 (-0.34; 0.56)	1.12 (0.71; 1.76)
51	FISHER	Adjusted	overall	diabetes	PPARA	90.0	595.0	40.0	554.0	NaN	NaN	0.109807	5.709000e-01	NaN	NaN
52	GLM-Binom	Adjusted	overall	cvd	Intercept	272.0	595.0	40.0	554.0	-0.242553	0.330850	NaN	4.634856e-01	-0.24 (-0.89; 0.41)	0.78 (0.41; 1.50)
53	GLM-Binom	Adjusted	overall	cvd	PPARA	272.0	595.0	40.0	554.0	-0.116603	0.316153	NaN	7.122634e-01	-0.12 (-0.74; 0.50)	0.89 (0.48; 1.65)
54	GLM-Binom	Adjusted	overall	cvd	age	272.0	595.0	40.0	554.0	-0.000826	0.005581	NaN	8.823172e-01	-0.00 (-0.01; 0.01)	1.00 (0.99; 1.01)
55	GLM-Binom	Adjusted	overall	cvd	sex	272.0	595.0	40.0	554.0	0.227868	0.165660	NaN	1.689712e-01	0.23 (-0.10; 0.55)	1.26 (0.91; 1.74)
56	CHISQ	Adjusted	overall	cvd	PPARA	272.0	595.0	40.0	554.0	NaN	NaN	1.735910	4.198092e-01	NaN	NaN
57	FISHER	Adjusted	overall	cvd	PPARA	272.0	595.0	40.0	554.0	NaN	NaN	0.045933	4.175000e-01	NaN	NaN

Stratified pipeline run

The association pipeline can be run separately for each defined subgroup (stratum). This is configured via the stratify parameter in create_config, which maps stratum names to filter strings in column=value format.

By default (stratify_overall=True), an overall (unstratified) run is also performed alongside each stratum.

Stratification columns must be present in phenotypes.tsv. Here we stratify by hypertension (0.0 = no hypertension, 1.0 = hypertension), a binary outcome already in the phenotype file. The pipeline compares values via .astype(str), so filter strings use '0.0' and '1.0'.

[ ]:

import logging

# Use a fresh temporary directory for the stratified run
tmp_strat = tempfile.mkdtemp(prefix='marvel_strat_')
config_strat = os.path.join(tmp_strat, 'example_strat.cnf')

# Stratification uses columns from phenotypes.tsv.
# 'hypertension' is a binary outcome (0.0 / 1.0 float64) in phenotypes.tsv.
# The pipeline compares via .astype(str), so filter strings use '0.0' and '1.0'.
create_config(
    path=config_strat,
    extract_variants=True,
    geno_input=geno_input,
    variant_input={'variants': os.path.join(input_dir, 'gene_variants.tsv')},
    variant_output={CFLhs.VarOutput: os.path.join(tmp_strat, 'variant_carriers')},
    association_analysis=True,
    pheno_input={CFLhs.PhenoFile: pheno_file, CFLhs.CovFile: cov_file},
    con_tests=con_tests,
    cat_tests=cat_tests,
    bin_tests=bin_tests,
    covs={'Unadjusted': 'None', 'Adjusted': ['age', 'sex']},
    stratify={
        'hypertension_0': 'hypertension=0.0',
        'hypertension_1': 'hypertension=1.0',
    },
    id_column='id',
    raise_on_error=True,
    ref_column='a1',
    alt_column='a2',
    cat_column='gene',
    incl_var=False,
    output_path=tmp_strat,
)

# Suppress INFO logging to keep committed cell output deterministic
logging.disable(logging.INFO)
try:
    strat_pipeline = MARVELousPipeline.from_config_file(config_strat, verbose=False)
    strat_pipeline.run()
finally:
    logging.disable(logging.NOTSET)

# Verify output: list all relative paths under tmp_strat
all_paths = sorted(
    os.path.relpath(p, tmp_strat)
    for p in glob.glob(os.path.join(tmp_strat, '**'), recursive=True)
    if os.path.isfile(p)
)

# Display output structure (relative paths only — no temp dir prefix)
print("Stratified run output files:")
for p in all_paths:
    print(" ", p)

# Assert both strata produced output
assert any('hypertension_0' in p for p in all_paths), "hypertension_0 stratum output not found"
assert any('hypertension_1' in p for p in all_paths), "hypertension_1 stratum output not found"
print("\nAssertions passed: hypertension_0 and hypertension_1 strata both produced output.")

Cleanup

Remove temporary files created during this example.

[9]:

# Uncomment the following lines to remove the temporary output directories
shutil.rmtree(output_dir)
shutil.rmtree(tmp_strat)