Source code for counting.run_counting_sc

"""Single-cell variant counting pipeline."""

from __future__ import annotations

import re
from pathlib import Path

# local imports
from .count_alleles_sc import make_count_matrix
from .filter_variant_data import intersect_vcf_region, parse_intersect_region_new, vcf_to_bed
from .run_counting import tempdir_decorator


class WaspCountSC:
    """Container for single-cell WASP counting pipeline configuration.

    Attributes
    ----------
    bam_file : str
        Path to the BAM alignment file.
    variant_file : str
        Path to the variant file (VCF, BCF, or PGEN).
    barcode_file : str
        Path to cell barcode file.
    feature_file : str | None
        Optional path to feature/region file.
    samples : list[str] | None
        List of sample IDs to process.
    out_file : str
        Output file path for AnnData.
    """
    def __init__(
        self,
        bam_file: str,
        variant_file: str,
        barcode_file: str,
        feature_file: str | None,
        samples: str | list[str] | None = None,
        use_region_names: bool = False,
        out_file: str | None = None,
        temp_loc: str | None = None,
    ) -> None:
        # TODO: ALSO ACCEPT .h5
        # User input files
        self.bam_file = bam_file
        self.variant_file = variant_file
        self.barcode_file = barcode_file  # Maybe could be optional?
        self.feature_file = feature_file
        self.use_region_names = use_region_names

        # Make sure samples are turned into a str list
        # Ideally a single sample for single cell
        normalized_samples: list[str] | None
        if isinstance(samples, str):
            # Check if sample file or comma-delimited string
            if Path(samples).is_file():
                with open(samples) as sample_file:
                    normalized_samples = [line.strip() for line in sample_file]
            else:
                normalized_samples = [s.strip() for s in samples.split(",")]
        else:
            normalized_samples = samples
        self.samples: list[str] | None = normalized_samples

        # Parse output?
        self.out_file: str = (
            out_file if out_file is not None else str(Path.cwd() / "allele_counts.h5ad")
        )

        # Failsafe if decorator doesn't create temp_loc
        self.temp_loc: str = temp_loc if temp_loc is not None else str(Path.cwd())

        # Parse variant file prefix (handle VCF, BCF, PGEN)
        variant_name = Path(self.variant_file).name
        if variant_name.endswith(".vcf.gz"):
            variant_prefix = variant_name[:-7]  # Remove .vcf.gz
        elif variant_name.endswith(".pgen"):
            variant_prefix = variant_name[:-5]  # Remove .pgen
        else:
            variant_prefix = re.split(r"\.vcf|\.bcf", variant_name)[0]
        self.variant_prefix = variant_prefix

        # Filtered variant output
        self.vcf_bed = str(Path(self.temp_loc) / f"{variant_prefix}.bed")

        # Parse feature file
        self.feature_type = None  # maybe use a boolean flag instead
        self.is_gene_file = False  # default; overridden below for gene annotations

        if self.feature_file is not None:
            f_ext = "".join(Path(self.feature_file).suffixes)

            if re.search(r"\.(.*Peak|bed)(?:\.gz)?$", f_ext, re.I):
                self.feature_type = "regions"
                self.intersect_file = str(
                    Path(self.temp_loc) / f"{variant_prefix}_intersect_regions.bed"
                )
                self.is_gene_file = False
            elif re.search(r"\.g[tf]f(?:\.gz)?$", f_ext, re.I):
                self.feature_type = "genes"
                self.intersect_file = str(
                    Path(self.temp_loc) / f"{variant_prefix}_intersect_genes.bed"
                )
                self.is_gene_file = True

                gtf_prefix = re.split(r"\.g[tf]f", Path(self.feature_file).name)[0]
                self.gtf_bed = str(Path(self.temp_loc) / f"{gtf_prefix}.bed")
                self.use_feature_names = True  # Use feature attributes as region names
            elif re.search(r"\.gff3(?:\.gz)?$", f_ext, re.I):
                self.feature_type = "genes"
                self.intersect_file = str(
                    Path(self.temp_loc) / f"{variant_prefix}_intersect_genes.bed"
                )
                self.is_gene_file = True

                gtf_prefix = re.split(r"\.gff3", Path(self.feature_file).name)[0]
                self.gtf_bed = str(Path(self.temp_loc) / f"{gtf_prefix}.bed")
                self.use_feature_names = True  # Use feature attributes as feature names
            else:
                raise ValueError(
                    f"Invalid feature file type. Expected .bed, .gtf, or .gff3, got: {self.feature_file}"
                )
        else:
            self.intersect_file = self.vcf_bed

        # TODO UPDATE THIS WHEN I ADD AUTOPARSERS
        if self.is_gene_file:
            # Possible edge case of vcf and gtf prefix conflict
            if self.vcf_bed == self.gtf_bed:
                self.gtf_bed = str(Path(self.temp_loc) / "genes.bed")
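For reference, a minimal sketch of how the constructor resolves its derived paths, using hypothetical file names and a hypothetical temp_loc of /tmp/wasp (in the pipeline the temporary directory is normally supplied by tempdir_decorator):

>>> cfg = WaspCountSC(
...     bam_file="cells.bam",
...     variant_file="donor1.vcf.gz",
...     barcode_file="barcodes.tsv",
...     feature_file="peaks.narrowPeak",
...     samples="donor1",
...     temp_loc="/tmp/wasp",
... )
>>> cfg.variant_prefix
'donor1'
>>> cfg.vcf_bed
'/tmp/wasp/donor1.bed'
>>> cfg.feature_type
'regions'
>>> cfg.intersect_file
'/tmp/wasp/donor1_intersect_regions.bed'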
@tempdir_decorator
def run_count_variants_sc(
    bam_file: str,
    variant_file: str,
    barcode_file: str,
    feature_file: str | None = None,
    samples: str | list[str] | None = None,
    use_region_names: bool = False,
    out_file: str | None = None,
    temp_loc: str | None = None,
) -> None:
    """Run single-cell variant counting pipeline.

    Parameters
    ----------
    bam_file : str
        Path to the BAM alignment file with cell barcodes.
    variant_file : str
        Path to the variant file (VCF, BCF, or PGEN).
    barcode_file : str
        Path to cell barcode file (one barcode per line).
    feature_file : str | None, optional
        Path to feature/region file (BED, GTF, or GFF3).
    samples : str | list[str] | None, optional
        Sample ID(s) to process.
    use_region_names : bool, optional
        Whether to use region names from the feature file.
    out_file : str | None, optional
        Output file path. Defaults to 'allele_counts.h5ad'.
    temp_loc : str | None, optional
        Directory for temporary files.

    Returns
    -------
    None
        Results are written to out_file as AnnData.
    """
    # Stores file names
    count_files = WaspCountSC(
        bam_file,
        variant_file,
        barcode_file=barcode_file,
        feature_file=feature_file,
        samples=samples,
        use_region_names=use_region_names,
        out_file=out_file,
        temp_loc=temp_loc,
    )

    # Create intermediary files
    # Maybe change include_gt based on preparse?
    vcf_to_bed(
        vcf_file=count_files.variant_file,
        out_bed=count_files.vcf_bed,
        samples=count_files.samples,
        include_gt=True,
    )

    # Intersect variants with features only when a feature file was given;
    # otherwise intersect_file already points at the variant BED.
    if count_files.feature_file is not None:
        intersect_vcf_region(
            vcf_file=count_files.vcf_bed,
            region_file=count_files.feature_file,
            out_file=count_files.intersect_file,
        )

    # TODO: handle use_region_names better
    df = parse_intersect_region_new(
        intersect_file=count_files.intersect_file,
        samples=count_files.samples,
        use_region_names=use_region_names,
        region_col=None,
    )

    # TODO: handle case where barcode file contains multiple columns
    with open(count_files.barcode_file) as file:
        bc_dict = {line.rstrip(): i for i, line in enumerate(file)}

    # Generate output
    adata = make_count_matrix(
        bam_file=count_files.bam_file,
        df=df,
        bc_dict=bc_dict,
        include_samples=count_files.samples,
    )

    # Write outputs
    adata.write_h5ad(count_files.out_file)
# TODO: include output options (i.e. MTX, dense?)
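A minimal usage sketch of the module-level entry point, with hypothetical input paths; the result is written as an .h5ad file that can be inspected with the anndata package:

from counting.run_counting_sc import run_count_variants_sc

import anndata as ad

# Hypothetical inputs: a barcoded BAM, a gzipped VCF, a barcode list, and a GTF.
run_count_variants_sc(
    bam_file="possorted_genome_bam.bam",
    variant_file="donor1.vcf.gz",
    barcode_file="barcodes.tsv",
    feature_file="genes.gtf",
    samples="donor1",
    out_file="allele_counts.h5ad",
)

# Inspect the resulting allele-count AnnData.
adata = ad.read_h5ad("allele_counts.h5ad")
print(adata)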