Source code for beers.library_prep.adapter_ligation_step

from beers_utils.molecule import Molecule
import numpy as np
import pickle
from timeit import default_timer as timer
from beers.utilities.adapter_generator import AdapterGenerator
import beers_utils.cigar


[docs]class AdapterLigationStep: """ Adpater Ligation attaches adapters to each end of the molecules These are used for the PCR step to initiate PCR and include the sample identifying barcodes. This uses the adapters specified in the resources section as well as the sample i5/i7 barcodes in the samples section Each adapter flanks the corresponding sample barcode, which all flanks the original molecule So the molecule ends up looking like (5' to 3'):: (pre_i5_adapter) (i5) (post_i5_adapter) (molecule sequence) (pre_i7_adapter) (i7) (post_i7_adapter) The example config adapters and barcodes have been obtained from: https://support-docs.illumina.com/SHARE/AdapterSeq/Content/SHARE/AdapterSeq/TruSeq/UDIndexes.htm Config Example:: samples: '1': # Sample one configuration barcodes: # Barcodes are used by AdapterLigationStep # Each sample should have unique (i5, i7) pair of barcodes # These get sequenced to determine which sample the read came from # Typical examples of these can be founda at: # https://support-docs.illumina.com/SHARE/AdapterSeq/Content/SHARE/AdapterSeq/TruSeq/UDIndexes.htm# TODO: set this value i5: AGCGCTAG i7: AACCGCGG '2': # Sample 2 configuration, same as above barcodes: i5: GATATCGA i7: TTATAACC resources: pre_i5_adapter: AATGATACGGCGACCACCGAGATCTACAC post_i5_adapter: ACACTCTTTCCCTACACGACGCTCTTCCGATCT pre_i7_adapter: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC post_i7_adapter: ATCTCGTATGCCGTCTTCTGCTTG """ name = "Adapter Ligation Step" def __init__(self, parameters, global_config): self.parameters = parameters self.global_config = global_config print(f"{self.name} instantiated") def execute(self, molecule_packet, rng, log): print(f"{self.name} starting") sample = molecule_packet.sample # Adapters combine a fixed sequence (specified in 'resources' config) # with barcodes that are sample-specific i5_barcode = self.global_config['samples'][str(sample.sample_id)]['barcodes']['i5'] i7_barcode = self.global_config['samples'][str(sample.sample_id)]['barcodes']['i7'] adapter_5_prime = self.global_config['resources']['pre_i5_adapter'] + i5_barcode + self.global_config['resources']['post_i5_adapter'] # NOTE: on the 3' end, an "A" gets ligated onto the sequence first. This is done as a discrete step # in the actual TruSeq protocol, but we do it as part of this step here. adapter_3_prime = "A" + self.global_config['resources']['pre_i7_adapter'] + i7_barcode + self.global_config['resources']['post_i7_adapter'] # Ligate the adapters onto each molecule for molecule in molecule_packet.molecules: sequence = molecule.sequence molecule.sequence = adapter_5_prime + sequence + adapter_3_prime molecule.start = 1 molecule.cigar = f"{len(adapter_5_prime)}S{len(sequence)}M{len(adapter_3_prime)}S" new_source_start, new_source_cigar, new_source_strand = beers_utils.cigar.chain( molecule.start, molecule.cigar, "+", molecule.source_start, molecule.source_cigar, molecule.source_strand ) molecule.source_start = new_source_start molecule.source_cigar = new_source_cigar molecule.source_strand = new_source_strand log.write(molecule) return molecule_packet @staticmethod def validate(parameters, global_config): errors = [] for sample_id, sample in global_config['samples'].items(): if not 'i5' in sample['barcodes'] or not 'i7' in sample['barcodes']: errors.append(f"In sample {sample_id}, expected to find i5 and i7 adpters in config.") adapters = ['pre_i5_adapter', 'pre_i7_adapter', 'post_i5_adapter', 'post_i7_adapter'] if not all(adapter in global_config['resources'] for adapter in adapters): errors.append(f"In resources config, need to specify all of {adapters}") return errors