Source code for alphaviz.io

#!python
"""
This module provides functions to read MQ/DiaNN/AlphaPept output files and other IO supplementary functions.
"""

import logging
import os
import pandas as pd
import alphaviz.preprocessing


def read_file(
    filepath: str,
    column_names: list
) -> pd.DataFrame:
    """Read only the specified columns from a delimited text file.

    The file is parsed line by line with plain string splitting, which avoids
    loading unneeded columns. All values are returned as strings.

    Parameters
    ----------
    filepath : str
        Full path to the file. The separator is derived from the extension:
        ',' for '.csv' and a tab for '.tsv' / '.txt' (case-insensitive).
    column_names : list
        A list of column names to be read.

    Returns
    -------
    pd.DataFrame
        This data frame contains the data (as strings) of the specified
        columns only, in the order given by `column_names`.

    Raises
    ------
    ValueError
        If the file extension is not supported or one of the requested
        columns is missing from the header line.
    """
    file_ext = os.path.splitext(filepath)[-1].lower()
    if file_ext == '.csv':
        sep = ','
    elif file_ext in ['.tsv', '.txt']:
        sep = '\t'
    else:
        # previously `sep` stayed unbound here and the loop failed with an
        # unrelated UnboundLocalError
        raise ValueError(
            f"Unsupported file extension '{file_ext}' for file {filepath}."
        )

    filename_col_index = []
    filename_data = []
    with open(filepath) as filelines:
        for i, line in enumerate(filelines):
            if i == 0:
                # the first line is the header: locate the requested columns
                header = line.strip().split(sep)
                filename_col_index = [
                    header.index(col) for col in column_names
                ]
                continue
            # remove only the line terminator so that empty leading/trailing
            # fields are kept and the last field does not end with '\n'
            line = line.rstrip('\r\n')
            if not line:
                # blank lines carry no data
                continue
            fields = line.split(sep)
            filename_data.append([fields[ind] for ind in filename_col_index])

    return pd.DataFrame(filename_data, columns=column_names)
def import_mq_evidence(
    filepath: str,
    experiment: str
) -> pd.DataFrame:
    """Read the output file evidence.txt of MaxQuant software for one experiment.

    Parameters
    ----------
    filepath : str
        Full path to the evidence.txt file.
    experiment : str
        The name of the experiment. Only rows whose 'Raw file' value equals
        this name are kept.

    Returns
    -------
    pd.DataFrame
        All columns of the file for the selected experiment, with the
        following adjustments:
            - 'Score' is renamed to 'Andromeda score' and 'K0' (if present) to '1/K0';
            - 'Gene names' is derived from 'Proteins' when the column is absent;
            - 'Charge', 'MS/MS count', 'Gene names', 'Raw file' are cast to 'category';
            - 'Retention time', 'Mass', 'm/z', '1/K0', 'Uncalibrated mass error [ppm]',
              'Mass error [ppm]' are cast to float and rounded to 4 decimals;
            - 'MS/MS scan number', 'Length', 'Intensity', 'Andromeda score' are
              converted to numeric and downcast to the smallest integer type possible;
            - 'Charge', 'm/z', 'Mass', '1/K0', 'Retention time' are moved right
              after the first column.
        Rows with missing 'MS/MS scan number', 'Proteins' or 'Gene names' values are dropped.
    """
    chunks = pd.read_csv(filepath, chunksize=1000000, sep='\t', low_memory=False)
    data_raw_file = pd.concat(chunks)

    # take an explicit copy of the filtered rows: every later column
    # assignment then works on an independent frame instead of a slice
    data_raw_file = data_raw_file[
        data_raw_file['Raw file'] == experiment
    ].copy()
    data_raw_file = data_raw_file.rename(
        columns={
            'Score': 'Andromeda score',
            'K0': '1/K0',
        }
    )
    data_raw_file = data_raw_file.dropna(
        axis=0,
        subset=['MS/MS scan number', 'Proteins']
    )

    if 'Gene names' not in data_raw_file.columns:
        # build the gene names from the 'Proteins' entries containing 'sp':
        # last '|'-separated field, part before the first '_'
        data_raw_file['Gene names'] = data_raw_file['Proteins'].apply(
            lambda x: ';'.join([
                entry.split('|')[-1].split('_')[0]
                for entry in x.split(';') if 'sp' in entry
            ])
        )

    for col in ['Charge', 'MS/MS count', 'Gene names', 'Raw file']:
        data_raw_file[col] = data_raw_file[col].astype('category')
    for col in ['Retention time', 'Mass', 'm/z', '1/K0',
                'Uncalibrated mass error [ppm]', 'Mass error [ppm]']:
        data_raw_file[col] = data_raw_file[col].astype(float).round(4)
    for col in ['MS/MS scan number', 'Length', 'Intensity', 'Andromeda score']:
        data_raw_file[col] = pd.to_numeric(
            data_raw_file[col],
            downcast='integer'
        )

    data_raw_file = data_raw_file.dropna(
        axis=0,
        subset=['MS/MS scan number', 'Gene names']
    )

    # keep the first column in place and insert the key columns right after it
    first_column_names = ['Charge', 'm/z', 'Mass', '1/K0', 'Retention time']
    columns = list(data_raw_file.columns.drop(first_column_names))
    columns[1:1] = first_column_names
    return data_raw_file[columns]
def import_mq_protein_groups(
    filepath: str,
    experiment: str
) -> pd.DataFrame:
    """Read the output file proteinGroups.txt of MaxQuant software.

    Parameters
    ----------
    filepath : str
        Full path to the proteinGroups.txt file.
    experiment : str
        The name of the experiment. It is used to pick the experiment
        specific columns, e.g. f'Peptides Exp_{experiment}'.

    Returns
    -------
    pd.DataFrame
        The output data frame starts with the columns:
            - 'Protein IDs',
            - 'Protein names',
            - 'Gene names',
            - 'Number of proteins' (renamed to '# proteins'),
            - 'Mol. weight [kDa]' (renamed to 'Mol weight, kDa'),
            - 'MS/MS count' (renamed to '# MS/MS'),
            - 'Sequence lengths',
            - f'Peptides Exp_{experiment}' (renamed to '(EXP) # peptides';
              the generic 'Peptides' column is used when it is absent),
            - f'Unique peptides Exp_{experiment}' (renamed to '(EXP) # unique peptides'),
            - f'Sequence coverage Exp_{experiment} [%]' (renamed to '(EXP) Seq coverage, %'),
        followed by all remaining columns in sorted order. All '... IDs'
        columns except 'Protein IDs', as well as 'Best MS/MS' and
        'Peptide is razor', are removed when present.
        Rows with missing 'Fasta headers', '(EXP) # peptides', 'Gene names',
        'Protein IDs' or (non-numeric) 'Score' values are dropped.
    """
    data_common = pd.read_csv(filepath, sep='\t', low_memory=False)

    # drop every redundant column that exists; a missing one must not
    # prevent the others from being removed
    redundant_columns = [
        col for col in data_common.columns
        if 'IDs' in col and col != 'Protein IDs'
    ] + ['Best MS/MS', 'Peptide is razor']
    data_common = data_common.drop(columns=redundant_columns, errors='ignore')

    data_common = data_common.rename(
        columns={
            'Number of proteins': '# proteins',
            'Mol. weight [kDa]': 'Mol weight, kDa',
            f'Peptides Exp_{experiment}': '(EXP) # peptides',
            f'Unique peptides Exp_{experiment}': '(EXP) # unique peptides',
            f'Sequence coverage Exp_{experiment} [%]': '(EXP) Seq coverage, %',
            'MS/MS count': '# MS/MS'
        }
    )
    data_common = data_common.dropna(axis=0, subset=['Fasta headers'])

    if '(EXP) # peptides' not in data_common.columns:
        # no experiment specific column: fall back to the overall counts
        data_common = data_common.rename(
            columns={'Peptides': '(EXP) # peptides'}
        )
    if '(EXP) # peptides' in data_common.columns:
        data_common = data_common.dropna(axis=0, subset=['(EXP) # peptides'])
        data_common['(EXP) # peptides'] = \
            data_common['(EXP) # peptides'].astype('int')

    # non-numeric scores become NaN (and are dropped below) instead of
    # crashing on missing values or rejecting valid negative/exponent numbers
    data_common['Score'] = pd.to_numeric(
        data_common['Score'], errors='coerce'
    ).astype(float)

    if 'Gene names' not in data_common.columns:
        data_common[['Protein names', 'Protein IDs', 'Gene names']] = \
            data_common.apply(
                lambda x: alphaviz.preprocessing.get_protein_info_from_fastaheader(
                    x['Fasta headers']
                ),
                axis=1,
                result_type='expand'
            )

    data_common = data_common.dropna(
        axis=0,
        subset=['Gene names', 'Protein IDs', 'Score']
    )

    first_columns = [
        'Protein IDs',
        'Protein names',
        'Gene names',
        '# proteins',
        'Mol weight, kDa',
        '# MS/MS',
        'Sequence lengths',
    ]
    first_columns.extend(
        [col for col in data_common.columns if '(EXP)' in col]
    )
    other_columns = sorted(set(data_common.columns).difference(first_columns))
    return data_common[first_columns + other_columns]
def import_mq_all_peptides(
    filepath: str
) -> pd.DataFrame:
    """Read some columns from the output file allPeptides.txt of MaxQuant software.

    Parameters
    ----------
    filepath : str
        Full path to the allPeptides.txt file.

    Returns
    -------
    pd.DataFrame
        The output data frame contains information about the following MQ columns:
            - 'Pasef MS/MS IDs' ('list' type, the value split at ';'),
            - 'MS/MS scan number' ('int' type).
        The rows of the data frame with an empty 'MS/MS scan number' value are dropped.
    """
    maxquant_all_peptides_columns = [
        'Pasef MS/MS IDs',
        'MS/MS scan number'
    ]
    data_common = read_file(filepath, maxquant_all_peptides_columns)
    data_common.columns = [col.strip() for col in data_common.columns]

    # values arrive as raw strings; strip surrounding whitespace (e.g. a line
    # terminator left on the last field of a row) before testing for emptiness
    data_common['MS/MS scan number'] = \
        data_common['MS/MS scan number'].str.strip()

    # copy the filtered rows so the conversions below modify an independent
    # frame rather than a slice of the unfiltered one
    data_common = data_common[data_common['MS/MS scan number'] != ''].copy()
    data_common['MS/MS scan number'] = \
        data_common['MS/MS scan number'].astype(int)
    data_common['Pasef MS/MS IDs'] = \
        data_common['Pasef MS/MS IDs'].str.split(';')
    return data_common
def import_mq_msms(
    filepath: str,
    experiment: str
) -> pd.DataFrame:
    """Read some columns from the output file msms.txt of MaxQuant software.

    Parameters
    ----------
    filepath : str
        Full path to the msms.txt file.
    experiment : str
        The name of the experiment. Only rows whose 'Raw file' value equals
        this name are kept.

    Returns
    -------
    pd.DataFrame
        The output data frame contains information about the following MQ columns:
            - 'Raw file',
            - 'Scan number' ('int' type),
            - 'Matches',
            - 'Masses',
            - 'Mass deviations [Da]',
            - 'Mass deviations [ppm]'.
    """
    try:
        maxquant_msms_columns = [
            'Raw file',
            'Scan number',
            'Matches',
            'Masses',
            'Mass deviations [Da]',
            'Mass deviations [ppm]'
        ]
        data_common = read_file(filepath, maxquant_msms_columns)
    except ValueError:
        # the header did not contain the lower-case spelling: retry with the
        # capitalised 'Deviations' column names
        maxquant_msms_columns = [
            'Raw file',
            'Scan number',
            'Matches',
            'Masses',
            'Mass Deviations [Da]',
            'Mass Deviations [ppm]'
        ]
        data_common = read_file(filepath, maxquant_msms_columns)

    # copy the filtered rows so the assignments below modify an independent
    # frame rather than a slice of the unfiltered one
    data_common = data_common[data_common['Raw file'] == experiment].copy()

    # normalise both header spellings to the lower-case one
    data_common.columns = [
        col.strip().replace('Deviations', 'deviations')
        for col in data_common.columns
    ]
    data_common['Scan number'] = data_common['Scan number'].astype('int')
    return data_common
def import_mq_summary(
    filepath: str
) -> pd.DataFrame:
    """Read the output file summary.txt of MaxQuant software.

    Parameters
    ----------
    filepath : str
        Full path to the summary.txt file.

    Returns
    -------
    pd.DataFrame
        The tab-separated content of the file without the rows that have
        a missing value in the 'MS' column.
    """
    summary = pd.read_csv(filepath, sep='\t', low_memory=False)
    # rows without an 'MS' value are discarded
    return summary.dropna(subset=['MS'], axis=0)
def import_mq_output(
    necessary_files: list,
    path_mq_output_folder: str,
    experiment: str
):
    """Lazily read the requested MaxQuant output files one after another.

    Parameters
    ----------
    necessary_files : list
        A list of strings containing the names of the MQ output files with extensions, e.g. ['allPeptides.txt', 'msms.txt'].
    path_mq_output_folder : str
        Path to the MaxQuant output folder with all output files needed.
    experiment : str
        The name of the experiment. It is passed on to every reader except
        those of 'allPeptides.txt' and 'summary.txt'.

    Returns
    -------
    generator
        Yields one pandas data frame per entry of `necessary_files`, in the
        same order.
    """
    readers = {
        'allPeptides.txt': import_mq_all_peptides,
        'msms.txt': import_mq_msms,
        'evidence.txt': import_mq_evidence,
        'proteinGroups.txt': import_mq_protein_groups,
        'summary.txt': import_mq_summary,
    }
    # these readers take the file path only
    path_only_files = ('allPeptides.txt', 'summary.txt')

    for file_name in necessary_files:
        full_path = os.path.join(path_mq_output_folder, file_name)
        reader = readers[file_name]
        if file_name in path_only_files:
            reader_args = (full_path,)
        else:
            reader_args = (full_path, experiment)
        table = reader(*reader_args)
        logging.info(f"MaxQuant output {file_name} file is uploaded.")
        yield table
def get_filenames_from_directory(
    directory: str,
    extensions_list: list
) -> list:
    """Search for files with the specified extensions in the directory and return a list of all file names with these extensions.

    Parameters
    ----------
    directory : str
        Path to the directory to search in.
    extensions_list : list
        A list of extensions without the leading dot, e.g. ['d', 'hdf'].
        A single extension given as a plain string, e.g. 'tsv', is accepted
        as well.

    Returns
    -------
    list
        The list of filtered file names based on their extensions, in the
        order returned by os.listdir().
    """
    if isinstance(extensions_list, str):
        # a bare string would turn the membership test below into a
        # substring test ('t' in 'tsv'), matching unrelated files
        extensions_list = [extensions_list]
    extensions = tuple(extensions_list)

    return [
        file_name for file_name in os.listdir(directory)
        # a name without any dot has no extension at all
        if '.' in file_name and file_name.rsplit('.', 1)[-1] in extensions
    ]
def read_fasta(
    filepath: str
) -> dict:
    """Open a fasta file as an indexed UniProt reader of the pyteomics package.

    Parameters
    ----------
    filepath : str
        Full path to the .fasta file.

    Returns
    -------
    pyteomics.fasta.IndexedUniProt object
        The object created by pyteomics.fasta.IndexedUniProt for the given file.
    """
    # pyteomics is imported inside the function, so it is only loaded when
    # a fasta file is actually read
    from pyteomics import fasta as pyteomics_fasta

    return pyteomics_fasta.IndexedUniProt(filepath)
def import_diann_stats(
    filepath: str,
    experiment: str
):
    """Load the DIANN output .stats.tsv file as a data frame.

    Parameters
    ----------
    filepath : str
        Full path to the .stats.tsv file.
    experiment : str
        The name of the experiment. It is accepted for a uniform interface
        but not used: the whole file is returned.

    Returns
    -------
    pd.DataFrame
        The unfiltered, tab-separated content of the file.
    """
    return pd.read_csv(filepath, sep='\t', low_memory=False)
def create_diann_proteins_table(
    diann_df: pd.DataFrame,
    fasta: object
):
    """Build the protein level table from the main DIANN output table.

    Parameters
    ----------
    diann_df : pd.DataFrame
        The original data frame after loading the main .tsv DIANN output file and filter by the experiment name.
    fasta : pyteomics.fasta.IndexedUniProt
        The object containing information about all proteins from the fasta file.

    Returns
    -------
    pd.DataFrame
        One row per combination of the grouping columns (all columns whose
        name contains 'PG', 'Protein' or 'Genes', except 'Protein.Group',
        'Protein.Ids' and 'Protein.Names') with the joined protein IDs, the
        number of distinct MS2 scans and stripped sequences, and the protein
        names and sequence lengths looked up in the fasta object.
    """
    def _join_unique(values):
        # comma separated distinct values of the group
        return ','.join(set(values))

    def _count_unique(values):
        # number of distinct values of the group
        return len(set(values))

    def _count_ids(joined_ids):
        return len(joined_ids.split(','))

    group_keys = [
        name for name in diann_df.columns
        if 'PG' in name or 'Protein' in name or 'Genes' in name
    ]
    for excluded in ('Protein.Group', 'Protein.Ids', 'Protein.Names'):
        group_keys.remove(excluded)

    aggregated = diann_df.groupby(group_keys).agg({
        'Protein.Ids': _join_unique,
        'MS2.Scan': _count_unique,
        'Stripped.Sequence': _count_unique
    })
    proteins = aggregated.reset_index()
    proteins = proteins.rename(columns={
        'MS2.Scan': '# MS/MS',
        'Stripped.Sequence': '(EXP) # peptides',
        'Genes': 'Gene names',
        'Protein.Ids': 'Protein IDs'
    })
    proteins['# proteins'] = proteins['Protein IDs'].apply(_count_ids)

    # every lookup yields a (protein names, sequence lengths) pair;
    # transpose the pairs into the two columns
    protein_info = proteins['Protein IDs'].apply(
        lambda ids: alphaviz.preprocessing.get_protein_info(fasta, ids)
    )
    names, lengths = zip(*protein_info)
    proteins['Protein names'] = names
    proteins['Sequence lengths'] = lengths

    leading = [
        'Protein IDs', 'Protein names', 'Gene names', '# proteins',
        '(EXP) # peptides', '# MS/MS', 'Sequence lengths'
    ]
    trailing = sorted(set(proteins.columns).difference(leading))
    return proteins[leading + trailing]
def create_diann_peptides_table(
    diann_df: pd.DataFrame
):
    """Extract information about peptides from the loaded main DIANN output .tsv file.

    Parameters
    ----------
    diann_df : pd.DataFrame
        The original data frame after loading the main .tsv DIANN output file and filter by the experiment name.
        It is not modified.

    Returns
    -------
    pd.DataFrame
        The peptide level columns of `diann_df` (everything whose name does
        not contain 'PG', 'Protein', 'Genes' or 'GG', minus the first two of
        these columns, plus 'Genes') with some columns renamed, the added
        columns 'Length', 'Sequence_AP_mod' and 'm/z', and the modified
        sequence converted by alphaviz.preprocessing.convert_diann_mq_mod.
    """
    # select directly from the input frame: a preliminary full copy of
    # `diann_df` is not needed because only the selection below is modified
    columns = [
        col for col in diann_df.columns
        if 'PG' not in col and 'Protein' not in col
        and 'Genes' not in col and 'GG' not in col
    ]
    columns.append('Genes')
    # NOTE(review): the first two selected columns are skipped - presumably
    # the file name and run columns of the DIANN report; confirm.
    peptides = diann_df[columns[2:]].copy()

    peptides['Length'] = peptides['Stripped.Sequence'].str.len()
    peptides.rename(columns={
        'MS2.Scan': 'MS/MS scan number',
        'Genes': 'Gene names',
        'Precursor.Charge': 'Charge',
        'Stripped.Sequence': 'Sequence'
    }, inplace=True)

    # 'Sequence_AP_mod' must be derived before 'Modified.Sequence' is
    # overwritten with its converted form
    peptides['Sequence_AP_mod'] = peptides['Modified.Sequence'].apply(
        alphaviz.preprocessing.convert_diann_ap_mod
    )
    peptides['Modified.Sequence'] = peptides['Modified.Sequence'].apply(
        alphaviz.preprocessing.convert_diann_mq_mod
    )
    # the m/z column is only initialised with zeros here
    peptides['m/z'] = 0.0

    first_columns = [
        'Modified.Sequence', 'Length', 'm/z', 'RT', 'Predicted.RT',
        'Charge', 'IM', 'Predicted.IM'
    ]
    other_columns = sorted(set(peptides.columns).difference(first_columns))
    return peptides[first_columns + other_columns]
def import_diann_output(
    path_diann_output_folder: str,
    experiment: str,
    fasta: object
):
    """Load two files from the DiaNN output folder and return the data frames containing information about proteins, peptides, and summary information about the whole experiment.

    The two .tsv files with the shortest file names are used: the shortest
    one as the main report and the second shortest as the stats file.

    Parameters
    ----------
    path_diann_output_folder : str
        Path to the DIANN output folder with all output files needed.
    experiment : str
        The name of the experiment. Only rows of the main report whose 'Run'
        value equals this name are used.
    fasta : pyteomics.fasta.IndexedUniProt
        The object containing information about all proteins from the fasta file.

    Returns
    -------
    tuple
        Four items: the proteins data frame, the peptides data frame, the
        data frame with the summary information about the whole experiment,
        and the file name of the main DIANN report.

    Raises
    ------
    ValueError
        If the folder contains fewer than two .tsv files.
    """
    # the extension is passed as a list: a bare string would be matched by
    # substring instead of by equality
    tsv_files = sorted(
        get_filenames_from_directory(path_diann_output_folder, ['tsv']),
        key=len
    )
    if len(tsv_files) < 2:
        raise ValueError(
            "Expected at least two .tsv files (main report and stats) in "
            f"{path_diann_output_folder}, found {len(tsv_files)}."
        )
    diann_output_file, diann_stats_file = tsv_files[:2]

    diann_df = pd.read_csv(
        os.path.join(path_diann_output_folder, diann_output_file),
        sep='\t',
        low_memory=False
    )
    diann_df = diann_df[diann_df.Run == experiment]

    diann_proteins = create_diann_proteins_table(diann_df, fasta)
    diann_peptides = create_diann_peptides_table(diann_df)
    diann_overview = import_diann_stats(
        os.path.join(path_diann_output_folder, diann_stats_file),
        experiment
    )
    return diann_proteins, diann_peptides, diann_overview, diann_output_file
def create_ap_proteins_table(
    ap_df: pd.DataFrame,
    fasta: object
):
    """Build the protein level table from the AlphaPept peptide results.

    Parameters
    ----------
    ap_df : pd.DataFrame
        The AlphaPept results table. It is modified in place: the columns
        'Protein names', 'Protein IDs', 'Gene names' and 'Sequence lengths'
        are added to it.
    fasta : object
        The object passed to alphaviz.preprocessing.get_protein_info to look
        up the protein names and sequence lengths.

    Returns
    -------
    pd.DataFrame
        One row per 'index_protein_group' value with the maximum of the
        protein related columns, the count of 'sequence' values
        ('(EXP) # peptides', also copied to '# MS/MS') and the number of
        comma separated entries of 'protein_idx' ('# proteins').
    """
    def _info_from_header(row):
        return alphaviz.preprocessing.get_protein_info_from_fastaheader(
            row['protein_group']
        )

    def _info_from_fasta(row):
        return alphaviz.preprocessing.get_protein_info(
            fasta, row['Protein IDs']
        )

    # both annotation steps write into the caller's frame on purpose
    ap_df[['Protein names', 'Protein IDs', 'Gene names']] = ap_df.apply(
        _info_from_header, axis=1, result_type='expand'
    )
    # 'Protein names' is overwritten with the value taken from the fasta
    ap_df[['Protein names', 'Sequence lengths']] = ap_df.apply(
        _info_from_fasta, axis=1, result_type='expand'
    )

    columns = [col for col in ap_df.columns if 'protein' in col]
    columns = columns + [
        'sequence', 'Protein names', 'Protein IDs', 'Gene names',
        'Sequence lengths'
    ]
    # peptides are counted, every other column is reduced to its maximum
    agg_dict = {
        col: ('count' if col == 'sequence' else 'max') for col in columns
    }
    grouped_ap_df = ap_df.groupby(
        'index_protein_group', as_index=False
    )[columns].agg(agg_dict)
    grouped_ap_df = grouped_ap_df.rename(
        columns={'sequence': '(EXP) # peptides'}
    )
    grouped_ap_df['# proteins'] = grouped_ap_df['protein_idx'].apply(
        lambda idx: len(idx.split(','))
    )
    grouped_ap_df['# MS/MS'] = grouped_ap_df['(EXP) # peptides']

    leading = [
        'Protein IDs', 'Protein names', 'Gene names', '# proteins',
        '(EXP) # peptides', '# MS/MS', 'Sequence lengths'
    ]
    trailing = sorted(set(grouped_ap_df.columns).difference(leading))
    return grouped_ap_df[leading + trailing]
def create_ap_peptides_table(
    ap_df: pd.DataFrame
):
    """Build the peptide level table from the AlphaPept peptide results.

    Parameters
    ----------
    ap_df : pd.DataFrame
        The AlphaPept results table. It is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of `ap_df` without the columns whose name contains 'protein'
        or 'Protein' and without 'Sequence lengths'. Several columns are
        renamed (e.g. 'n_AA' to 'Length', 'sequence' to 'Sequence_AP_mod'),
        'Modified.Sequence' is added as a duplicate of 'Sequence_AP_mod',
        and the columns are ordered with the key peptide columns first and
        the rest sorted by name.
    """
    new_names = {
        'n_AA': 'Length',
        'charge': 'Charge',
        'sequence_naked': 'Sequence',
        'parent': 'MS/MS scan number',
        'sequence': 'Sequence_AP_mod',
        'mz': 'm/z',
        'mass': 'Mass',
        'mobility': 'IM',
        'rt': 'RT',
    }
    kept = [
        name for name in ap_df.columns
        if not (
            'protein' in name
            or 'Protein' in name
            or name == 'Sequence lengths'
        )
    ]
    # work on an independent copy so the input frame stays untouched
    peptides = ap_df[kept].copy()
    peptides = peptides.rename(columns=new_names)
    peptides['Modified.Sequence'] = peptides['Sequence_AP_mod']

    leading = [
        'Modified.Sequence', 'Length', 'm/z', 'RT',
        'Charge', 'Mass', 'IM'
    ]
    trailing = sorted(set(peptides.columns).difference(leading))
    return peptides[leading + trailing]
def import_alphapept_output(
    path_ap_output_folder: str,
    experiment: str,
    fasta: object
):
    """Load the AlphaPept results file and return the proteins and peptides tables.

    Parameters
    ----------
    path_ap_output_folder : str
        Path to the AlphaPept output folder containing 'results_peptides.csv'.
    experiment : str
        The name of the experiment. Only rows whose 'shortname' value equals
        this name are used.
    fasta : object
        The object with the fasta information that is passed on to
        create_ap_proteins_table.

    Returns
    -------
    tuple of pd.DataFrames
        The proteins data frame and the peptides data frame.
    """
    ap_output_file = 'results_peptides.csv'
    ap_df = pd.read_csv(
        os.path.join(path_ap_output_folder, ap_output_file),
        low_memory=False
    )

    # take an explicit copy of the filtered rows: the frame is extended with
    # new columns further down and must not be a slice of the full table
    ap_df = ap_df[ap_df.shortname == experiment].copy()
    ap_df = ap_df.drop(columns=['filename', 'shortname', 'sample_group'])

    # create_ap_proteins_table adds annotation columns to `ap_df` in place,
    # so the same object has to be handed to the peptides table afterwards
    ap_proteins = create_ap_proteins_table(ap_df, fasta)
    ap_peptides = create_ap_peptides_table(ap_df)
    return ap_proteins, ap_peptides