Source code for analysis.run_analysis

"""Allelic imbalance analysis pipeline.

Main entry point for running the beta-binomial allelic imbalance analysis
using the Rust-accelerated backend.
"""

from __future__ import annotations

import logging
from csv import reader
from pathlib import Path
from typing import Literal

import pandas as pd

# Rust analysis (required; no Python fallback)
try:
    from wasp2_rust import analyze_imbalance as rust_analyze_imbalance
except ImportError:
    rust_analyze_imbalance = None

logger = logging.getLogger(__name__)


[docs] class WaspAnalysisData: """Container for allelic imbalance analysis configuration. Attributes ---------- count_file : str | Path Path to the count TSV file. region_col : str | None Column name for grouping variants by region. groupby : str | None Column name for additional grouping (e.g., parent gene). out_file : str Output file path for results. phased : bool Whether to use phased genotype information. model : Literal["single", "linear"] Dispersion model type. min_count : int Minimum total allele count threshold. pseudocount : int Pseudocount to add to allele counts. """
[docs] def __init__( self, count_file: str | Path, min_count: int | None = None, pseudocount: int | None = None, phased: bool | None = None, model: str | None = None, out_file: str | None = None, region_col: str | None = None, groupby: str | None = None, ) -> None: # User input data self.count_file = count_file self.region_col = region_col self.groupby = groupby # group by region or parent? self.out_file = out_file # TODO parse vcf for phased instead of default unphased self.phased: bool = bool(phased) # Default to single dispersion model if model == "linear": self.model: Literal["single", "linear"] = "linear" else: self.model = "single" # Default min count of 10, pseudocount of 1 self.min_count: int = 10 if min_count is None else min_count self.pseudocount: int = 1 if pseudocount is None else pseudocount # Read header only for validation with open(self.count_file) as f: count_cols = next(reader(f, delimiter="\t")) # 7 columns at minimum, 10 at maximum # 3required : chr, pos, ref, alt # 3 optional: <GT>, <region>, <parent> # 3 required: ref_count, alt_count, other_count # [chr, pos, ref, alt, <GT>, <region>, <parent>, ref_c, alt_c, other_c] if "GT" in count_cols: min_cols = 8 region_idx = 5 else: min_cols = 7 region_idx = 4 # Check regions if self.region_col is None: if len(count_cols) > min_cols: self.region_col = count_cols[region_idx] # By default group by feature rather than parent? if self.groupby is not None: # If denoting to group by feature if (self.region_col is None) or (self.groupby == self.region_col): self.groupby = None elif (len(count_cols) > (min_cols + 1)) and self.groupby in { count_cols[region_idx + 1], "Parent", "parent", }: self.groupby = count_cols[region_idx + 1] # Set group else: # Maybe throw error instead logger.warning("%s not found in columns %s", self.groupby, count_cols) self.groupby = None # Create default outfile if self.out_file is None: self.out_file = str(Path.cwd() / "ai_results.tsv") # do this after
[docs] def run_ai_analysis( count_file: str | Path, min_count: int | None = None, pseudocount: int | None = None, phased: bool | None = None, model: str | None = None, out_file: str | None = None, region_col: str | None = None, groupby: str | None = None, ) -> None: """Run allelic imbalance analysis pipeline. Parameters ---------- count_file : str | Path Path to TSV file with allele counts. min_count : int | None, optional Minimum total count threshold, by default 10. pseudocount : int | None, optional Pseudocount to add, by default 1. phased : bool | None, optional Use phased genotype information, by default False. model : str | None, optional Dispersion model ('single' or 'linear'), by default 'single'. out_file : str | None, optional Output file path, by default 'ai_results.tsv'. region_col : str | None, optional Column name for grouping variants. groupby : str | None, optional Additional grouping column. Raises ------ RuntimeError If Rust analysis extension is not available. """ # Store analysis data and params ai_files = WaspAnalysisData( count_file, min_count=min_count, pseudocount=pseudocount, phased=phased, model=model, out_file=out_file, region_col=region_col, groupby=groupby, ) # Run analysis pipeline (Rust only) if rust_analyze_imbalance is None: raise RuntimeError( "Rust analysis extension not available. Build it with " "`maturin develop --release` in the WASP2 env." ) results = rust_analyze_imbalance( str(ai_files.count_file), min_count=ai_files.min_count, pseudocount=ai_files.pseudocount, method=ai_files.model, ) ai_df = pd.DataFrame(results) # Maybe give option to sort or not sort by pval if "fdr_pval" in ai_df.columns: ai_df = ai_df.sort_values(by="fdr_pval", ascending=True) # Write results ai_df.to_csv(ai_files.out_file, sep="\t", header=True, index=False)