Source code for mapping.wasp_data_files
"""File path management for WASP mapping pipeline.
Provides the WaspDataFiles class for managing input/output paths
and auto-detecting file properties.
"""
from __future__ import annotations
import json
import logging
import re
from pathlib import Path
from pysam import VariantFile
from pysam.libcalignmentfile import AlignmentFile
logger = logging.getLogger(__name__)
[docs]
class WaspDataFiles:
"""Manage file paths and auto-detection for WASP mapping pipeline."""
[docs]
def __init__(
self,
bam_file: str | Path,
variant_file: str | Path,
is_paired: bool | None = None,
samples: str | list[str] | None = None,
is_phased: bool | None = None,
out_dir: str | Path | None = None,
temp_loc: str | Path | None = None,
) -> None:
# User input files
self.bam_file = bam_file
self.variant_file = variant_file
self.is_paired = is_paired
self.samples = samples
self.is_phased = is_phased
self.out_dir = out_dir
self.temp_loc = temp_loc
# Autoparse args
if self.is_paired is None:
with AlignmentFile(str(self.bam_file), "r") as bam:
self.is_paired = next(bam.head(1)).is_paired
# Process samples as list
if self.samples is None:
self.is_phased = False # No phasing w/o sample
elif isinstance(self.samples, str):
# Check if sample file or comma delim string
if Path(self.samples).is_file():
with open(self.samples) as sample_file:
self.samples = [l.strip() for l in sample_file]
else:
self.samples = [s.strip() for s in self.samples.split(",")]
# self.samples = self.samples.split(",") # should i strip spaces?
# At this point, self.samples is normalized to Optional[List[str]]
# Check if variant file is phased (only works for VCF/BCF, not PGEN)
if self.is_phased is None and self.samples is not None:
# TODO GOTTA FIX THIS TO CHECK IF PHASED
# Note: This only works for VCF/BCF files, PGEN doesn't store phase in the same way
variant_path = Path(self.variant_file)
suffix = variant_path.suffix.lower()
if suffix in (".vcf", ".bcf") or str(variant_path).lower().endswith(".vcf.gz"):
with VariantFile(str(self.variant_file), "r") as vcf:
vcf_samps = next(vcf.fetch()).samples
samps_phased = [vcf_samps[s].phased for s in self.samples]
if all(samps_phased):
self.is_phased = True
else:
# TODO GOTTA WARN UNPHASED BAD
# TODO WARN SOME UNPHASED WHILE OTHERS PHASED
self.is_phased = False
else:
# PGEN format - assume phased (user should specify if not)
self.is_phased = True
if self.out_dir is None:
self.out_dir = Path(bam_file).parent # change to cwd?
# TODO handle temp loc, maybe make default if temp not made?
# Temporary workaround until figure out temp dir options
if self.temp_loc is None:
self.temp_loc = self.out_dir
# Generate intermediate files
# Maybe use easy defalt names if temp loc in use
# Handle different variant file extensions for prefix extraction
variant_name = Path(self.variant_file).name
if variant_name.endswith(".vcf.gz"):
variant_prefix = variant_name[:-7] # Remove .vcf.gz
elif variant_name.endswith(".pgen"):
variant_prefix = variant_name[:-5] # Remove .pgen
else:
variant_prefix = re.split(r"\.vcf|\.bcf", variant_name)[0]
bam_prefix = Path(self.bam_file).name.rsplit(".bam")[0]
self.variant_prefix = variant_prefix
self.bam_prefix = bam_prefix
self.vcf_bed = str(Path(self.temp_loc) / f"{variant_prefix}.bed")
self.remap_reads = str(Path(self.temp_loc) / f"{bam_prefix}_remap_reads.txt")
self.intersect_file = str(
Path(self.temp_loc) / f"{bam_prefix}_{variant_prefix}_intersect.bed"
)
self.to_remap_bam = str(Path(self.out_dir) / f"{bam_prefix}_to_remap.bam")
self.keep_bam = str(Path(self.out_dir) / f"{bam_prefix}_keep.bam")
# Relevant output reads
if self.is_paired:
self.remap_fq1 = str(Path(self.out_dir) / f"{bam_prefix}_swapped_alleles_r1.fq")
self.remap_fq2: str | None = str(
Path(self.out_dir) / f"{bam_prefix}_swapped_alleles_r2.fq"
)
else:
self.remap_fq1 = str(Path(self.out_dir) / f"{bam_prefix}_swapped_alleles.fq")
self.remap_fq2 = None
[docs]
def write_data(self, out_file: str | Path | None = None) -> None:
"""Export Relevant Files to JSON
Used for parsing post remapping step easily
:param out_file: name for output file if not using default
:type out_file: str, optional
"""
if out_file is None:
out_file = str(Path(str(self.out_dir)) / f"{self.bam_prefix}_wasp_data_files.json")
with open(out_file, "w") as json_out:
json.dump(self.__dict__, json_out)
logger.info("File data written to JSON: %s", out_file)