Source code for alphaviz.preprocessing

#!python
"""
This module provides functions that are helping to preprocess the data.
"""

import re
import logging
import pandas as pd


[docs]def get_mq_unique_proteins( filepath: str ) -> list: """Extract unique "Protein names" from the specified MaxQuant output file. Parameters ---------- filepath : str Full path to the file. Returns ------- list A list of unique protein names from the specified file. """ if filepath.endswith('.txt'): sep = '\t' else: raise ValueError('The given file does not have an original .txt extension.') with open(filepath) as filelines: i = 0 filename_col_index = int() filename_data = [] for line in filelines: line = line.split(sep) if i == 0: try: filename_col_index = line.index('Protein names') except ValueError: raise ValueError('The given file does not contain the "Protein names" column.') else: filename_data.append(line[filename_col_index]) i += 1 unique_proteins = set(filename_data) sorted_unique_proteins = sorted(list(unique_proteins)) return sorted_unique_proteins
[docs]def filter_df( df: pd.DataFrame, pattern: str, column: str, software: str ) -> pd.DataFrame: """Filter the data frame based on the pattern (any value) in the specified column. Parameters ---------- df : pd.DataFrame The original data frame. pattern : str The string to be used to filter of the data frame column. column : str The column to be used to filter. software: str The name of the software tool where the filtering is used. Returns ------- pd.DataFrame The filtered data frame. """ if not pattern: return df if software == 'maxquant': output = df[df[column].str.contains(pattern, na=False)] else: output = df[df[column] == pattern] return output
[docs]def sort_naturally( line: str, reverse: bool = False ) -> str: """Sort the string natural to humans, e.g. 4,1,6,11 will be sorted as 1,4,6,11 and not like 1,11,4,6. Parameters ---------- line : str The string to be sorted. reverse : bool Whether to apply the reverse option or not. Defaults: False. Returns ------- str A naturally sorted string. """ def _convert(value): if value.isdigit(): return int(value) return value.lower() alphanum_key = lambda key: [_convert(c) for c in re.split('([0-9]+)', key)] return sorted(line, key=alphanum_key, reverse=reverse)
[docs]def get_aa_seq( protein_id: str, fasta, # pyteomics.fasta.IndexedUniProt object ) -> str: """Extract the leading razor protein sequence for the list of proteinIDs of the protein group from the pyteomics.fasta.IndexedUniProt object. Parameters ---------- protein_ids : str String containing the proteinIDs of all protein isoforms. fasta : pyteomics.fasta.IndexedUniProt object The pyteomics.fasta.IndexedUniProt object. Returns ------- str Protein sequence for the leading razor protein, e.g. from the list of proteinIDs 'Q15149;Q15149-7;Q15149-9;Q15149-5' the AA sequence for protein Q15149 will be returned. """ # for id in sorted(protein_ids.split(';'), reverse=True): try: protein_seq = fasta.get_by_id(protein_id).sequence return protein_seq except KeyError: logging.info(f"The provided protein ID {id} is missing in the fasta file.")
[docs]def get_mq_ms2_scan_data( msms: pd.DataFrame, selected_msms_scan: int, raw_data, # AlphaTims TimsTOF object, precursor_id: int ) -> pd.DataFrame: """Extract MS2 data as a data frame for the specified MSMS scan number and precursor ID from the 'msms.txt' MQ output file and raw file. Parameters ---------- msms : pd.DataFrame Pre-loaded 'msms.txt' MQ output file. selected_msms_scan : int MSMS scan number. raw_data : AlphaTims TimsTOF object AlphaTims TimsTOF object. precursor_id : int The identifier of the precursor. Returns ------- pd.DataFrame For the specified MSMS scan and precursor ID, the extracted data frame contains the following columns: - 'mz_values' - 'intensity_values' - 'ions' - 'wrong_dev_value': whether the mass_deviation specified in the MQ table was incorrect. """ msms_filtered = msms[msms['Scan number'] == selected_msms_scan] msms_filtered_df = pd.DataFrame( { 'ions': msms_filtered.Matches.values[0].split(';'), 'mz': msms_filtered.Masses.values[0].split(';'), 'mass_dev_Da': msms_filtered['Mass deviations [Da]'].values[0].split(';'), 'mass_dev_ppm': msms_filtered['Mass deviations [ppm]'].values[0].split(';') } ) for col in ['mz', 'mass_dev_Da', 'mass_dev_ppm']: msms_filtered_df[col] = msms_filtered_df[col].astype(float) data = raw_data[:, :, precursor_id].loc[:, ['mz_values', 'intensity_values']] # can be slightly faster by only retrieving the indices and converting directly to mz values and intensities data['ions'] = '-' data['wrong_dev_value'] = False for row in msms_filtered_df.itertuples(): # inefficient implementation. Sorting both arrays by mz should allow you to do it much faster. ion_index = data.mz_values.sub(row.mz).abs().idxmin() mass_dev_ppm_calc = ((row.mz + row.mass_dev_Da - data.loc[ion_index, 'mz_values']) * 10**6) / data.loc[ion_index, 'mz_values'] # to think how not to set a fixed threshold? ppm_threshold = 100 if abs(mass_dev_ppm_calc) < ppm_threshold: data.loc[ion_index, 'ions'] = row.ions msms_filtered_df.loc[msms_filtered_df.ions == row.ions, 'mass_dev_ppm'] = mass_dev_ppm_calc data.sort_values(['ions', 'intensity_values'], ascending=True, inplace=True) data_merged = pd.merge(data, msms_filtered_df, on='ions', how='left') return data_merged.drop('mz', axis=1)
[docs]def get_identified_ions( values: list, sequence: str, ion_type: str ) -> list: """For the specified peptide sequence extract all identified in the experiment ions and based on the specified ion_type return a list of booleans containing information for the b-ions whether the peptide is breaking after aligned amino acid or for the y-ion whether is breaking before aligned amino acid. E.g. for the peptide 'NTINHN' having the unique ion values ['b2-H2O', 'b2', 'b3', 'b5-NH3'] it will return the following list of booleans: [False,True,True,False,True,False]. Parameters ---------- values : list The list of all unique identified ions for the peptide in the experiment. sequence : str Peptide sequence. ion : str Ion type, e.g. 'b' or 'y'. Other ion types are not implemented. Returns ------- list List of peptide length of booleans with True for the presenting ion and False for a missing one. """ ions = [False] * (len(sequence)) all_ions = list(set([ion.split('-')[0] for ion in set(values) if ion_type in ion])) for each in all_ions: if ion_type == 'b': ions[int(each.replace(ion_type, '')) - 1] = True elif ion_type == 'y': ions[-int(each.replace(ion_type, ''))] = True else: raise NotImplementedError(f"The specified ion type {ion_type} is not implemented in the current version.") return ions
[docs]def convert_diann_mq_mod( sequence: str ) -> str: # this function is taken from the AlphaMap package and modified """Convert DIA-NN style modifications to MaxQuant style modifications. Args: sequence (str): A peptide sequence with a DIA-NN style modification. Returns: str: A peptide sequence with MaxQuant style modification. """ modif_convers_dict = { '(UniMod:1)': '[Acetyl ({})]', '(UniMod:2)': '[Amidated ({})]', '(UniMod:4)': '[Carbamidomethyl ({})]', '(UniMod:5)': '[Carbamyl ({})]', '(UniMod:7)': '[Deamidation ({})]', '(UniMod:21)': '[Phospho ({})]', '(UniMod:23)': '[Dehydrated ({})]', '(UniMod:26)': '[Pyro-carbamidomethyl ({})]', '(UniMod:27)': '[Glu->pyro-Glu]', '(UniMod:28)': '[Gln->pyro-Glu]', '(UniMod:30)': '[Cation:Na ({})]', '(UniMod:34)': '[Methyl ({})]', '(UniMod:35)': '[Oxidation ({})]', '(UniMod:36)': '[Dimethyl ({})]', '(UniMod:37)': '[Trimethyl ({})]', '(UniMod:40)': '[Sulfo ({})]', '(UniMod:55)': '[Cys-Cys]', '(UniMod:121)': '[GlyGly ({})]', '(UniMod:254)': '[Delta:H(2)C(2) ({})]', '(UniMod:312)': '[Cysteinyl]', '(UniMod:345)': '[Trioxidation ({})]', '(UniMod:408)': '[Hydroxyproline]', '(UniMod:425)': '[Dioxidation ({})]', '(UniMod:526)': '[Dethiomethyl ({})]', '(UniMod:877)': '[QQTGG ({})]', } mods = re.findall(r'\(UniMod:\d+\)', sequence) if mods: for mod in mods: posit = re.search(r'\(UniMod:\d+\)', sequence) i = posit.start() if i == 0: add_aa = 'N-term' elif posit.end() == len(sequence): add_aa = 'C-term' else: add_aa = sequence[i-1] if mod == '(UniMod:7)': if add_aa in 'NQ': add_aa = 'NQ' elif mod == '(UniMod:21)': if add_aa in 'STY': add_aa = 'STY' elif mod == '(UniMod:23)': if add_aa in 'ST': add_aa = 'ST' elif mod == '(UniMod:30)': if add_aa in 'DE': add_aa = 'DE' elif mod == '(UniMod:34)': if add_aa in 'KR': add_aa = 'KR' elif mod == '(UniMod:36)': if add_aa in 'KR': add_aa = 'KR' elif mod == '(UniMod:40)': if add_aa in 'STY': add_aa = 'STY' elif mod == '(UniMod:425)': if add_aa in 'MW': add_aa = 'MW' if mod in modif_convers_dict.keys(): sequence = sequence.replace(mod, modif_convers_dict.get(mod).format(add_aa), 1) else: logging.info(f"This modification {mod} can't be converted.") return sequence
[docs]def convert_diann_ap_mod( sequence: str ) -> str: """Convert DIA-NN style modifications to AlphaPept style modifications. Args: sequence (str): A peptide sequence with a DIA-NN style modification. Returns: str: A peptide sequence with AlphaPept style modification. """ modif_convers_dict = { '(UniMod:1)': 'a', # '[Acetyl ({})]' '(UniMod:2)': 'am', # '[Amidated ({})]' '(UniMod:4)': 'c', # '[Carbamidomethyl ({})]' '(UniMod:7)': 'deam', # '[Deamidation ({})]' '(UniMod:21)': 'p', # '[Phospho ({})]' '(UniMod:26)': 'cm', # '[Pyro-carbamidomethyl ({})]', '(UniMod:27)': 'pg', # '[Glu->pyro-Glu]' '(UniMod:28)': 'pg', # '[Gln->pyro-Glu]' '(UniMod:35)': 'ox', # '[Oxidation ({})]' } mods = re.findall(r'\(UniMod:\d+\)', sequence) if mods: for mod in mods: posit = re.search(r'\(UniMod:\d+\)', sequence) i = posit.start() if i != 0: i -= 1 if mod in modif_convers_dict.keys(): sequence = sequence.replace(mod, '', 1) sequence = sequence[:i] + modif_convers_dict[mod] + sequence[i:] else: logging.info(f"This modification {mod} can't be converted.") return sequence
[docs]def get_protein_info( fasta: dict, protein_ids: str ): """Get the name and the length of the protein(s) from the fasta file specifying the protein id(s). Parameters ---------- fasta : pyteomics.fasta.IndexedUniProt object The Pyteomics object contains information about all proteins from the .fasta file. protein_ids : str The list of the protein IDs separated by comma. Returns ------- tuple of strings The name and the length of the specified protein(s). """ protein_names = [] protein_seq_lens = [] for protein_id in protein_ids.replace(';', ' ').split(): try: protein_names.append(fasta.get_by_id(protein_id).description['name']) except KeyError: logging.info(f"The protein id {protein_id} is not found in the fasta file.") try: protein_seq_lens.append(str(len(fasta.get_by_id(protein_id).sequence))) except KeyError: logging.info(f"The sequence length for the protein {protein_id} is not found in the fasta file.") return ','.join(protein_names), ','.join(protein_seq_lens)
[docs]def get_protein_info_from_fastaheader( string: str, **kwargs ): """Extract information about protein IDs, protein names and gene names from the "Fasta headers" column of the MQ output tables. Parameters ---------- string : str A 'Fasta header' string from the MQ output table for one protein group (e.g. from the proteinGroups.txt file). E.g. a complex one: 'sp|Q3SY84|K2C71_HUMAN Keratin, type II cytoskeletal 71 OS=Homo sapiens OX=9606 GN=KRT71 PE=1 SV=3;;sp|Q14CN4|K2C72_HUMAN Keratin, type II cytoskeletal 72 OS=Homo sapiens OX=9606 GN=KRT72 PE=1 SV=2;;;sp|Q7RTS7|K2C74_HUMAN Keratin, type II cytoskeletal 74 OS' Returns ------- a tuple of strings The function returns a tuple of three strings containing information about the protein names, protein IDs and gene names. """ protein_names = [] protein_ids = [] genes = [] for protein in string.split(';'): if protein: try: protein_names.append(re.findall(r'\s(.+)OS', protein)[0].strip()) except: protein_names.append("") try: protein_ids.append(protein.split('|')[1]) except: protein_ids.append("") try: genes.append(protein.split()[0].split('|')[-1].split('_')[0]) except: genes.append("") protein_names = ";".join(protein_names) if protein_names else None protein_ids = ";".join(protein_ids) if protein_ids else None genes = ";".join(genes) if genes else None return protein_names, protein_ids, genes