Source code for beers.library_prep.first_strand_synthesis_step

import numpy as np

from beers_utils.molecule import Molecule
from beers_utils.general_utils import GeneralUtils
import beers_utils.cigar

[docs]class FirstStrandSynthesisStep: ''' First Strand Synthesis Step A library preparation step that places random primers onto the RNA molecules and then extends the primers to obtain cDNA. Primer location can be biased by read sequence (see the 'position probability matrix' parameter). Even when not biased, priming can be either perfect or imperfect. Perfect means that the first base will always have a primer bind to it and therefore the entire molecule will be perfectly duplicated as cDNA. Imperfect means that primers are placed randomly (at a rate of 'primers_per_kb') and the 5'-most one is extended to create the molecule, potentially losing some of the 5' end of the molecule. Configuration example:: parameters: # If 'perfect_priming' is true, then we always prime # exactly on the 5' most end of the molecule and the # cDNA matches perfectly the original molecule # If true, all other parameters are ignored. perfect_priming: false # PPM matrix describing biases of the priming sites # Values must sum to 1 in each base. Length of the PPM # determines the primer lengths. # See Kasper et all, 2010 PMID: 20395217 for example # observation of these biases in fragment start positions. # If no bias is desired, set all values to 0.25. position_probability_matrix: A: [0.50, 0.1, 0.40, 0.30, 0.25, 0.15] C: [0.20, 0.5, 0.3 , 0.25, 0.25, 0.15] G: [0.15, 0.1, 0.15, 0.25, 0.25, 0.20] T: [0.15, 0.3, 0.15, 0.20, 0.25, 0.50] # Rate of priming: approximate number of primers that # will bind (if bias-free) per kilobase of the fragment # Higher numbers will loose less of the 5' side. primes_per_kb: 50 ''' name = "First Strand Synthsis Step" def __init__(self, parameters, global_config): self.parameters = parameters self.global_config = global_config self.position_probability_matrix = self.get_position_probability_matrix(self.parameters) self.primer_length = self.position_probability_matrix.shape[1] self.primes_per_kb = self.parameters['primes_per_kb'] self.perfect_priming = self.parameters['perfect_priming'] print("First Strand cDNA Synthesis Step instantiated.") def execute(self, molecule_packet, rng, log): print("First strand cDNA synthesis step starting...") cdna_sample = [] for molecule in molecule_packet.molecules: if len(molecule.sequence) < self.primer_length: continue # Can't prime a sequence this short, so it'll just drop if self.perfect_priming: # Perfectly prime at the first base, regardless of sequence primed_site = 0 else: # First choose how many places will be primed number_of_primed_sites = rng.binomial(n = len(molecule.sequence), p = self.primes_per_kb/1000) if number_of_primed_sites == 0: number_of_primed_sites = 1 # TODO: we force everything to be primed at least once, but maybe we should allow non-priming? # Then weight all possible priming spots according to how they match the given sequence seq_bases = GeneralUtils.sequence_to_matrix(molecule.sequence) weights = np.array([np.lib.stride_tricks.sliding_window_view(seq_bases[i], self.primer_length) * self.position_probability_matrix[i, :] for i in range(4)]).sum(axis=0).prod(axis=1) if weights.sum() == 0: # Should only occur if a sequence lacks all the normal bases, which is an error condition. We log it here # and choose a random priming site print(f"Weighting failed unexpectedly on the following molecule {molecule.molecule_id} with sequence {molecule.sequence}") p = None else: p = weights/weights.sum() # Then choose the priming sites and take the 5'-most one priming_sites = rng.choice(len(weights), p=p, size=number_of_primed_sites) primed_site = min(priming_sites) cdna_start = primed_site + 1 cdna_cigar = f"{len(molecule.sequence)-cdna_start+1}M" cdna_source_start, cdna_source_cigar, cdna_source_strand = beers_utils.cigar.chain( cdna_start, cdna_cigar, "-", molecule.source_start, molecule.source_cigar, molecule.source_strand ) cdna_seq = GeneralUtils.create_complement_strand(molecule.sequence[cdna_start - 1:]) cdna_molecule = Molecule( molecule.molecule_id + '.cdna' , cdna_seq, start = cdna_start, cigar = cdna_cigar, transcript_id = molecule.transcript_id, source_start = cdna_source_start, source_cigar = cdna_source_cigar, source_strand = cdna_source_strand, source_chrom = molecule.source_chrom, ) cdna_sample.append(cdna_molecule) log.write(cdna_molecule) print("First strand cDNA synthesis step complete.") molecule_packet.molecules = cdna_sample return molecule_packet @staticmethod def get_position_probability_matrix(parameters): position_probability_matrix = np.array([ parameters["position_probability_matrix"][base] for base in GeneralUtils.BASE_ORDER ]) return position_probability_matrix @staticmethod def validate(parameters, global_config): errors = [] if 'position_probability_matrix' not in parameters: errors.append("Must contain 'position_probability_matrix'") else: ppm = parameters['position_probability_matrix'] okay = True length = None for base in GeneralUtils.BASE_ORDER: if base not in ppm: errors.append(f"Must specify {base} in 'position_probability_matrix'") continue probs = ppm[base] if not isinstance(probs, list) or not all(isinstance(x, (float, int)) for x in probs): errors.append(f"Position probability matrix entry for {base} must be list of numbers") okay = False continue if length is None: length = len(probs) elif length != len(probs): errors.append(f"All position probability matrix entries must have the same length: had {len(probs)} for {base} but expected {length}") okay = False if okay: ppm = FirstStrandSynthesisStep.get_position_probability_matrix(parameters) sums = ppm.sum(axis=0) if not np.isclose(sums, 1).all(): errors.append(f"Position probability matrix entries must sum to 1 across all the four bases at each position") if not (ppm > 0).all(): errors.append("Position probability matrix entries must all be positive") if 'primes_per_kb' not in parameters: errors.append("Must contain 'primes_per_kb' parameter") elif not isinstance(parameters['primes_per_kb'], (float, int)): errors.append("primes_per_kb must be a number") elif parameters['primes_per_kb'] <= 0: errors.append("primer_per_kb must be positive") if 'perfect_priming' not in parameters: errors.append("Must specify 'perfect_priming' (as true/false)") elif not isinstance(parameters['perfect_priming'], bool): errors.append(f"Expected 'perfect_priming' to be a bool but instead was {parameters['perfect_priming']}") return errors