Source code for beers.library_prep.second_strand_synthesis_step

import numpy as np

from beers_utils.molecule import Molecule
from beers_utils.general_utils import GeneralUtils
import beers_utils.cigar

[docs]class SecondStrandSynthesisStep:
    '''
    Second Strand Synthesis Step

    A library preparation step that places random primers onto the first strand cDNA
    and then extends the primers to obtain double-stranded DNA.

    Primer location can be biased by read sequence (see the 'position probability matrix' parameter).
    Even when not biased, priming can be either perfect or imperfect. Perfect means that the first
    base will always have a primer bind to it and therefore the entire molecule will be perfectly
    duplicated as cDNA. Imperfect means that primers are placed randomly (at a rate of 'primers_per_kb')
    and the 5'-most one is extended to create the molecule, potentially losing some of the 5' end of the
    molecule.
    '''

    name = "Second Strand Synthsis Step"

    def __init__(self, parameters, global_config):
        self.parameters = parameters
        self.global_config = global_config
        self.position_probability_matrix = self.get_position_probability_matrix(self.parameters)
        self.primer_length = self.position_probability_matrix.shape[1]
        self.primes_per_kb = self.parameters['primes_per_kb']
        self.perfect_priming = self.parameters['perfect_priming']
        print("Second Strand cDNA Synthesis Step instantiated.")

    def execute(self, molecule_packet, rng, log):
        print("Second strand cDNA synthesis step starting...")
        cdna_sample = []
        for molecule in molecule_packet.molecules:
            if len(molecule.sequence) < self.primer_length:
                continue # Can't prime a sequence this short, so it'll just drop

            if self.perfect_priming:
                # Perfectly prime at the first base, regardless of sequence
                primed_site = 0
            else:
                # First choose how many places will be primed
                number_of_primed_sites = rng.binomial(n = len(molecule.sequence), p = self.primes_per_kb/1000)
                if number_of_primed_sites == 0:
                    number_of_primed_sites = 1 # TODO: we force everything to be primed at least once, but maybe we should allow non-priming?

                # Then weight all possible priming spots according to how they match the given sequence
                seq_bases = GeneralUtils.sequence_to_matrix(molecule.sequence)
                weights = np.array([np.lib.stride_tricks.sliding_window_view(seq_bases[i], self.primer_length) * self.position_probability_matrix[i, :]
                                                        for i in range(4)]).sum(axis=0).prod(axis=1)

                if weights.sum() == 0:
                    # Should only occur if a sequence lacks all the normal bases, which is an error condition. We log it here
                    # and choose a random priming site
                    print(f"Weighting failed unexpectedly on the following molecule {molecule.molecule_id} with sequence {molecule.sequence}")
                    p = None
                else:
                    p = weights/weights.sum()

                # Then choose the priming sites and take the 5'-most one
                priming_sites = rng.choice(len(weights), p=p, size=number_of_primed_sites)
                primed_site = min(priming_sites)

            cdna_start = primed_site + 1
            cdna_cigar = f"{len(molecule.sequence)-cdna_start+1}M"
            cdna_source_start, cdna_source_cigar, cdna_source_strand = beers_utils.cigar.chain(
                    cdna_start, cdna_cigar, "-",
                    molecule.source_start, molecule.source_cigar, molecule.source_strand
            )
            cdna_seq = GeneralUtils.create_complement_strand(molecule.sequence[cdna_start - 1:])
            cdna_molecule = Molecule(
                    molecule.molecule_id + '.cdna' ,
                    cdna_seq,
                    start = cdna_start,
                    cigar = cdna_cigar,
                    transcript_id = molecule.transcript_id,
                    source_start = cdna_source_start,
                    source_cigar = cdna_source_cigar,
                    source_strand = cdna_source_strand,
                    source_chrom = molecule.source_chrom,
            )
            cdna_sample.append(cdna_molecule)
            log.write(cdna_molecule)

        print("Second strand cDNA synthesis step complete.")
        molecule_packet.molecules = cdna_sample
        return molecule_packet

    @staticmethod
    def get_position_probability_matrix(parameters):
        position_probability_matrix = np.array([
            parameters["position_probability_matrix"][base]
                for base in GeneralUtils.BASE_ORDER
        ])
        return position_probability_matrix


    @staticmethod
    def validate(parameters, global_config):
        errors = []
        if 'position_probability_matrix' not in parameters:
            errors.append("Must contain 'position_probability_matrix'")
        else:
            ppm = parameters['position_probability_matrix']
            okay = True
            length = None
            for base in GeneralUtils.BASE_ORDER:
                if base not in ppm:
                    errors.append(f"Must specify {base} in 'position_probability_matrix'")
                    continue
                probs = ppm[base]
                if not isinstance(probs, list) or not all(isinstance(x, (float, int)) for x in probs):
                    errors.append(f"Position probability matrix entry for {base} must be list of numbers")
                    okay = False
                    continue

                if length is None:
                    length = len(probs)
                elif length != len(probs):
                    errors.append(f"All position probability matrix entries must have the same length: had {len(probs)} for {base} but expected {length}")
                    okay = False
            if okay:
                ppm =  SecondStrandSynthesisStep.get_position_probability_matrix(parameters)
                sums = ppm.sum(axis=0)
                if not np.isclose(sums, 1).all():
                    errors.append(f"Position probability matrix entries must sum to 1 across all the four bases at each position")
                if not (ppm > 0).all():
                    errors.append("Position probability matrix entries must all be positive")

        if 'primes_per_kb' not in parameters:
            errors.append("Must contain 'primes_per_kb' parameter")
        elif not isinstance(parameters['primes_per_kb'], (float, int)):
            errors.append("primes_per_kb must be a number")
        elif parameters['primes_per_kb'] <= 0:
            errors.append("primer_per_kb must be positive")

        if 'perfect_priming' not in parameters:
            errors.append("Must specify 'perfect_priming' (as true/false)")
        elif not isinstance(parameters['perfect_priming'], bool):
            errors.append(f"Expected 'perfect_priming' to be a bool but instead was {parameters['perfect_priming']}")

        return errors