# Source code for alphaviz.io

#!python
"""
This module provides functions to read MQ/DiaNN/AlphaPept output files and other IO supplementary functions.
"""

import logging
import os
import pandas as pd
import alphaviz.preprocessing


def read_file(
    filepath: str,
    column_names: list
) -> pd.DataFrame:
    """Read only the specified columns of a delimited text file.

    The file is parsed line by line with plain string splitting, so all
    returned values are strings. The separator is derived from the file
    extension: ',' for '.csv' and a tab for '.tsv' and '.txt'.

    Parameters
    ----------
    filepath : str
        Full path to the file.
    column_names : list
        A list of column names to be read. They have to match the names in the header (first line) of the file.

    Returns
    -------
    pd.DataFrame
        This data frame contains the (string) values of the specified columns, in the order given in column_names. An empty file results in an empty data frame.

    Raises
    ------
    ValueError
        If the file extension is not supported or if one of the specified columns is missing in the header.
    """
    file_ext = os.path.splitext(filepath)[-1].lower()
    if file_ext == '.csv':
        sep = ','
    elif file_ext in ('.tsv', '.txt'):
        sep = '\t'
    else:
        raise ValueError(
            f"Unsupported file extension '{file_ext}' of the file {filepath}. "
            "Supported extensions are '.csv', '.tsv' and '.txt'."
        )

    filename_data = []
    with open(filepath) as filelines:
        header = next(filelines, None)
        if header is not None:
            # the first line is used to extract the index of the specified columns
            header_fields = header.strip().split(sep)
            filename_col_index = [
                header_fields.index(col) for col in column_names
            ]
            # use these indices for all other rows to extract the data
            for line in filelines:
                # remove only the line terminator, otherwise it would stay
                # attached to the value of the last column
                line = line.rstrip('\r\n')
                if not line:
                    # skip blank lines (e.g. at the end of the file)
                    continue
                fields = line.split(sep)
                filename_data.append(
                    [fields[ind] for ind in filename_col_index]
                )

    return pd.DataFrame(filename_data, columns=column_names)


def import_mq_evidence(
    filepath: str,
    experiment: str
) -> pd.DataFrame:
    """Read the output file evidence.txt of MaxQuant software for one experiment.

    Parameters
    ----------
    filepath : str
        Full path to the evidence.txt file.
    experiment : str
        The name of the experiment. Only rows whose 'Raw file' value equals this name are kept.

    Returns
    -------
    pd.DataFrame
        The output data frame contains all columns of the file with the following adjustments:
            - 'Score' is renamed to 'Andromeda score' and 'K0' (if present) to '1/K0',
            - 'Gene names' is derived from the 'Proteins' column if it is missing in the file,
            - 'Charge', 'MS/MS count', 'Gene names', 'Raw file' are converted to the 'category' type,
            - 'Retention time', 'Mass', 'm/z', '1/K0', 'Uncalibrated mass error [ppm]', 'Mass error [ppm]' are converted to float and rounded to 4 decimals,
            - 'MS/MS scan number', 'Length', 'Intensity', 'Andromeda score' are converted to numeric values and downcast to the smallest integer type where possible,
            - 'Charge', 'm/z', 'Mass', '1/K0', 'Retention time' are moved right after the first column.
        The rows of the data frame with missing 'MS/MS scan number', 'Proteins' or 'Gene names' values are dropped.
    """
    chunks = pd.read_csv(filepath, chunksize=1000000, sep='\t', low_memory=False)
    data_raw_file = pd.concat(chunks)
    # work on an explicit copy of the filtered rows, so the in-place
    # modifications below never act on a slice of the full table
    data_raw_file = data_raw_file[data_raw_file['Raw file'] == experiment].copy()
    data_raw_file.rename(
        columns={
            'Score': 'Andromeda score',
            'K0': '1/K0',
        },
        inplace=True
    )
    data_raw_file.dropna(
        axis=0,
        subset=['MS/MS scan number', 'Proteins'],
        inplace=True
    )
    if 'Gene names' not in data_raw_file.columns:
        # take the part between the last '|' and the first following '_'
        # of every ';'-separated protein entry containing 'sp'
        data_raw_file['Gene names'] = data_raw_file['Proteins'].apply(
            lambda x: ';'.join([entry.split('|')[-1].split('_')[0] for entry in x.split(';') if 'sp' in entry])
        )
    for col in ['Charge', 'MS/MS count', 'Gene names', 'Raw file']:
        data_raw_file[col] = data_raw_file[col].astype('category')
    for col in ['Retention time', 'Mass', 'm/z', '1/K0', 'Uncalibrated mass error [ppm]', 'Mass error [ppm]']:
        data_raw_file[col] = data_raw_file[col].astype(float).round(4)
    for col in ['MS/MS scan number', 'Length', 'Intensity', 'Andromeda score']:
        data_raw_file[col] = pd.to_numeric(
            data_raw_file[col],
            downcast='integer'
        )
    # rows without 'MS/MS scan number' were already removed above
    data_raw_file.dropna(
        axis=0,
        subset=['Gene names'],
        inplace=True
    )
    first_column_names = ['Charge', 'm/z', 'Mass', '1/K0', 'Retention time']
    columns = list(data_raw_file.columns.drop(first_column_names))
    # insert the most important columns right after the first column
    columns[1:1] = first_column_names
    return data_raw_file[columns]


def import_mq_protein_groups(
    filepath: str,
    experiment: str
) -> pd.DataFrame:
    """Read the output file proteinGroups.txt of MaxQuant software.

    Parameters
    ----------
    filepath : str
        Full path to the proteinGroups.txt file.
    experiment : str
        The name of the experiment.

    Returns
    -------
    pd.DataFrame
        The output data frame starts with the following columns:
            - 'Protein IDs',
            - 'Protein names',
            - 'Gene names',
            - 'Number of proteins' (renamed to '# proteins'),
            - 'Mol. weight [kDa]' (renamed to 'Mol weight, kDa'),
            - 'MS/MS count' (renamed to '# MS/MS'),
            - 'Sequence lengths',
            - f'Peptides Exp_{experiment}' (renamed to '(EXP) # peptides'; the common 'Peptides' column is used if the experiment specific one is missing),
            - f'Unique peptides Exp_{experiment}' (renamed to '(EXP) # unique peptides'),
            - f'Sequence coverage Exp_{experiment} [%]' (renamed to '(EXP) Seq coverage, %'),
        followed by all remaining columns in alphabetical order. All columns containing 'IDs' in their names (except for 'Protein IDs') and the columns 'Best MS/MS' and 'Peptide is razor' are removed.
        The rows of the data frame with missing 'Fasta headers', '(EXP) # peptides', 'Gene names', 'Protein IDs' or 'Score' values are dropped. Non-numeric 'Score' values are treated as missing.
    """
    data_common = pd.read_csv(filepath, sep='\t', low_memory=False)

    columns_to_drop = [
        col for col in data_common.columns
        if 'IDs' in col and col != 'Protein IDs'
    ] + ['Best MS/MS', 'Peptide is razor']
    # errors='ignore' drops every listed column that exists instead of
    # dropping nothing at all as soon as a single one of them is missing
    data_common.drop(columns=columns_to_drop, errors='ignore', inplace=True)

    data_common.rename(
        columns={
            'Number of proteins': '# proteins',
            'Mol. weight [kDa]': 'Mol weight, kDa',
            f'Peptides Exp_{experiment}': '(EXP) # peptides',
            f'Unique peptides Exp_{experiment}': '(EXP) # unique peptides',
            f'Sequence coverage Exp_{experiment} [%]': '(EXP) Seq coverage, %',
            'MS/MS count': '# MS/MS'
        },
        inplace=True
    )
    data_common.dropna(
        axis=0,
        subset=['Fasta headers'],
        inplace=True
    )
    if '(EXP) # peptides' not in data_common.columns:
        # fall back to the common column if there is no experiment specific one
        data_common.rename(
            columns={
                'Peptides': '(EXP) # peptides',
            },
            inplace=True
        )
    if '(EXP) # peptides' in data_common.columns:
        data_common.dropna(
            axis=0,
            subset=['(EXP) # peptides'],
            inplace=True
        )
        data_common['(EXP) # peptides'] = data_common['(EXP) # peptides'].astype('int')

    # values that cannot be parsed as numbers become NaN and the
    # corresponding rows are dropped below
    data_common['Score'] = pd.to_numeric(
        data_common['Score'],
        errors='coerce'
    ).astype(float)

    if 'Gene names' not in data_common.columns:
        data_common[['Protein names', 'Protein IDs', 'Gene names']] = data_common.apply(lambda x: alphaviz.preprocessing.get_protein_info_from_fastaheader(x['Fasta headers']), axis=1, result_type='expand')
    data_common.dropna(
        axis=0,
        subset=['Gene names', 'Protein IDs', 'Score'],
        inplace=True
    )

    first_columns = [
        'Protein IDs',
        'Protein names',
        'Gene names',
        '# proteins',
        'Mol weight, kDa',
        '# MS/MS',
        'Sequence lengths',
    ]
    first_columns.extend([col for col in data_common.columns if '(EXP)' in col])

    other_columns = sorted(set(data_common.columns).difference(first_columns))
    data_common = data_common[first_columns + other_columns]

    return data_common


def import_mq_all_peptides(
    filepath: str
) -> pd.DataFrame:
    """Read some columns from the output file allPeptides.txt of MaxQuant software.

    Parameters
    ----------
    filepath : str
        Full path to the allPeptides.txt file.

    Returns
    -------
    pd.DataFrame
        The output data frame contains information about the following MQ columns:
            - 'Pasef MS/MS IDs' ('list' type, the original value split by ';'),
            - 'MS/MS scan number' ('int' type).
        The rows of the data frame with empty 'MS/MS scan number' values are dropped.
    """
    maxquant_all_peptides_columns = [
        'Pasef MS/MS IDs',
        'MS/MS scan number'
    ]
    data_common = read_file(filepath, maxquant_all_peptides_columns)
    data_common.columns = [col.strip() for col in data_common.columns]
    data_common['MS/MS scan number'] = data_common['MS/MS scan number'].str.strip()
    # copy the filtered rows, so the column assignments below do not act
    # on a slice of the unfiltered data frame
    data_common = data_common[data_common['MS/MS scan number'] != ''].copy()
    data_common['MS/MS scan number'] = data_common['MS/MS scan number'].astype(int)
    data_common['Pasef MS/MS IDs'] = data_common['Pasef MS/MS IDs'].str.split(';')
    return data_common


def import_mq_msms(
    filepath: str,
    experiment: str
) -> pd.DataFrame:
    """Read some columns from the output file msms.txt of MaxQuant software.

    Parameters
    ----------
    filepath : str
        Full path to the msms.txt file.
    experiment : str
        The name of the experiment. Only rows whose 'Raw file' value equals this name are kept.

    Returns
    -------
    pd.DataFrame
        The output data frame contains information about the following MQ columns:
            - 'Raw file',
            - 'Scan number' ('int' type),
            - 'Matches',
            - 'Masses',
            - 'Mass deviations [Da]',
            - 'Mass deviations [ppm]'.
        The mass deviation columns are also accepted with the spelling 'Mass Deviations [...]' in the file and are always returned as 'Mass deviations [...]'.
    """
    common_columns = ['Raw file', 'Scan number', 'Matches', 'Masses']
    try:
        data_common = read_file(
            filepath,
            common_columns + ['Mass deviations [Da]', 'Mass deviations [ppm]']
        )
    except ValueError:
        # the header uses a different spelling of the mass deviation columns
        data_common = read_file(
            filepath,
            common_columns + ['Mass Deviations [Da]', 'Mass Deviations [ppm]']
        )
    # copy the filtered rows, so the column assignment below does not act
    # on a slice of the unfiltered data frame
    data_common = data_common[data_common['Raw file'] == experiment].copy()
    data_common.columns = [col.strip().replace('Deviations', 'deviations') for col in data_common.columns]
    data_common['Scan number'] = data_common['Scan number'].astype('int')
    return data_common


def import_mq_summary(
    filepath: str
) -> pd.DataFrame:
    """Load the MaxQuant summary.txt output file.

    Parameters
    ----------
    filepath : str
        Full path to the summary.txt file.

    Returns
    -------
    pd.DataFrame
        The content of the tab-separated file without the rows that have a missing value in the 'MS' column.
    """
    summary = pd.read_csv(filepath, sep='\t', low_memory=False)
    # keep only the rows with a value in the 'MS' column
    return summary.dropna(subset=['MS'], axis=0)


def import_mq_output(
    necessary_files: list,
    path_mq_output_folder: str,
    experiment: str
):
    """Lazily load the requested MaxQuant output files one after another.

    Parameters
    ----------
    necessary_files : list
        The names (with extensions) of the MQ output files to load, e.g. ['allPeptides.txt', 'msms.txt']. Supported names are 'allPeptides.txt', 'msms.txt', 'evidence.txt', 'proteinGroups.txt' and 'summary.txt'.
    path_mq_output_folder : str
        Path to the MaxQuant output folder containing these files.
    experiment : str
        The name of the experiment. It is passed on to every reader except for those of 'allPeptides.txt' and 'summary.txt'.

    Yields
    ------
    pd.DataFrame
        One data frame per entry of necessary_files, in the given order.
    """
    readers = {
        'allPeptides.txt': import_mq_all_peptides,
        'msms.txt': import_mq_msms,
        'evidence.txt': import_mq_evidence,
        'proteinGroups.txt': import_mq_protein_groups,
        'summary.txt': import_mq_summary,
    }
    # these readers take only the file path as an argument
    experiment_independent = ('allPeptides.txt', 'summary.txt')

    for file_name in necessary_files:
        full_path = os.path.join(path_mq_output_folder, file_name)
        if file_name in experiment_independent:
            reader_args = (full_path,)
        else:
            reader_args = (full_path, experiment)
        frame = readers[file_name](*reader_args)
        logging.info(f"MaxQuant output {file_name} file is uploaded.")
        yield frame


def get_filenames_from_directory(
    directory: str,
    extensions_list: list
) -> list:
    """Search for files with the specified extensions in the directory and return a list of all file names with these extensions.

    Parameters
    ----------
    directory : str
        Path to the directory to search in.
    extensions_list : list
        A list of extensions, e.g. ['d', 'hdf']. A single extension given as a string, e.g. 'tsv', is accepted as well. A leading dot is ignored.

    Returns
    -------
    list
        The list of filtered file names based on their extensions, i.e. the part after the last dot. Names without a dot are never returned. The order is the one returned by os.listdir().
    """
    if isinstance(extensions_list, str):
        # a bare string would turn the membership test below into a
        # substring match (e.g. 't' in 'tsv')
        extensions_list = [extensions_list]
    extensions = {ext.lstrip('.') for ext in extensions_list}

    file_names = []
    for file in os.listdir(directory):
        _, dot, extension = file.rpartition('.')
        # 'dot' is empty for names without any extension
        if dot and extension in extensions:
            file_names.append(file)
    return file_names


def read_fasta(
    filepath: str
) -> dict:
    """Open a fasta file with the pyteomics package.

    Parameters
    ----------
    filepath : str
        Full path to the .fasta file.

    Returns
    -------
    pyteomics.fasta.IndexedUniProt object
        The object created by pyteomics for the specified file.
    """
    # pyteomics is imported only when a fasta file is actually read
    from pyteomics.fasta import IndexedUniProt
    return IndexedUniProt(filepath)


def import_diann_stats(
    filepath: str,
    experiment: str
):
    """Load the DIANN output .stats.tsv file.

    Parameters
    ----------
    filepath : str
        Full path to the .stats.tsv file.
    experiment : str
        The name of the experiment. It is currently not used, i.e. the rows of all experiments are returned.

    Returns
    -------
    pd.DataFrame
        The complete content of the tab-separated file.
    """
    return pd.read_csv(filepath, sep='\t', low_memory=False)


def create_diann_proteins_table(
    diann_df: pd.DataFrame,
    fasta: object
):
    """Build the protein level table from the loaded main DIANN output .tsv file.

    Parameters
    ----------
    diann_df : pd.DataFrame
        The data frame of the main .tsv DIANN output file, filtered by the experiment name.
    fasta : pyteomics.fasta.IndexedUniProt
        The object containing information about all proteins from the fasta file. It is passed on to alphaviz.preprocessing.get_protein_info().

    Returns
    -------
    pd.DataFrame
        One row per unique combination of the columns containing 'PG', 'Protein' or 'Genes' in their names (without 'Protein.Group', 'Protein.Ids' and 'Protein.Names'). The table starts with the columns 'Protein IDs', 'Protein names', 'Gene names', '# proteins', '(EXP) # peptides', '# MS/MS' and 'Sequence lengths', followed by all other columns in alphabetical order.
    """
    def _join_unique(values):
        return ','.join(set(values))

    def _count_unique(values):
        return len(set(values))

    def _count_ids(joined_ids):
        return len(joined_ids.split(','))

    def _lookup_protein_info(joined_ids):
        return alphaviz.preprocessing.get_protein_info(fasta, joined_ids)

    group_keys = [
        name for name in diann_df.columns
        if 'PG' in name or 'Protein' in name or 'Genes' in name
    ]
    # these columns are not part of the grouping key
    for excluded in ('Protein.Group', 'Protein.Ids', 'Protein.Names'):
        group_keys.remove(excluded)

    aggregations = {
        'Protein.Ids': _join_unique,
        'MS2.Scan': _count_unique,
        'Stripped.Sequence': _count_unique,
    }
    proteins = diann_df.groupby(group_keys).agg(aggregations).reset_index()
    proteins = proteins.rename(columns={
        'MS2.Scan': '# MS/MS',
        'Stripped.Sequence': '(EXP) # peptides',
        'Genes': 'Gene names',
        'Protein.Ids': 'Protein IDs',
    })
    proteins['# proteins'] = proteins['Protein IDs'].apply(_count_ids)

    # every lookup yields a (protein names, sequence lengths) pair
    names, lengths = zip(*proteins['Protein IDs'].apply(_lookup_protein_info))
    proteins['Protein names'] = names
    proteins['Sequence lengths'] = lengths

    first_columns = [
        'Protein IDs', 'Protein names', 'Gene names', '# proteins',
        '(EXP) # peptides', '# MS/MS', 'Sequence lengths'
    ]
    other_columns = sorted(set(proteins.columns).difference(first_columns))
    return proteins[first_columns + other_columns]


def create_diann_peptides_table(
    diann_df: pd.DataFrame
):
    """Build the peptide level table from the loaded main DIANN output .tsv file.

    Parameters
    ----------
    diann_df : pd.DataFrame
        The data frame of the main .tsv DIANN output file, filtered by the experiment name. It is not modified.

    Returns
    -------
    pd.DataFrame
        The columns of diann_df without 'PG', 'Protein', 'Genes' or 'GG' in their names (the first two of them are skipped) plus the 'Genes' column. Several columns are renamed ('MS2.Scan' to 'MS/MS scan number', 'Genes' to 'Gene names', 'Precursor.Charge' to 'Charge', 'Stripped.Sequence' to 'Sequence'), the columns 'Length', 'Sequence_AP_mod' and 'm/z' (filled with 0.0) are added and the 'Modified.Sequence' values are converted with alphaviz.preprocessing.convert_diann_mq_mod().
    """
    excluded_tags = ('PG', 'Protein', 'Genes', 'GG')
    selected = [
        name for name in diann_df.columns
        if not any(tag in name for tag in excluded_tags)
    ]
    selected.append('Genes')

    # the first two selected columns are left out
    # (presumably file and run identifiers - TODO confirm)
    peptides = diann_df[selected[2:]].copy()
    peptides['Length'] = peptides['Stripped.Sequence'].str.len()
    peptides = peptides.rename(columns={
        'MS2.Scan': 'MS/MS scan number',
        'Genes': 'Gene names',
        'Precursor.Charge': 'Charge',
        'Stripped.Sequence': 'Sequence',
    })

    # both conversions start from the original DIANN notation
    diann_modified = peptides['Modified.Sequence']
    peptides['Sequence_AP_mod'] = diann_modified.apply(
        alphaviz.preprocessing.convert_diann_ap_mod
    )
    peptides['Modified.Sequence'] = diann_modified.apply(
        alphaviz.preprocessing.convert_diann_mq_mod
    )
    peptides['m/z'] = 0.0

    first_columns = [
        'Modified.Sequence', 'Length', 'm/z', 'RT',
        'Predicted.RT', 'Charge', 'IM', 'Predicted.IM'
    ]
    other_columns = sorted(set(peptides.columns).difference(first_columns))
    return peptides[first_columns + other_columns]


def import_diann_output(
    path_diann_output_folder: str,
    experiment: str,
    fasta: object
):
    """Load two files from the DiaNN output folder and returns the data frames containing information about proteins, peptides, and summary information about the whole experiment.

    The two .tsv files with the shortest names in the folder are used: the shortest one as the main DIANN output file and the second shortest one as the stats file.

    Parameters
    ----------
    path_diann_output_folder : str
        Path to the DIANN output folder with all output files needed.
    experiment : str
        The name of the experiment. Only rows of the main output file whose 'Run' value equals this name are used.
    fasta : pyteomics.fasta.IndexedUniProt
        The object containing information about all proteins from the fasta file.

    Returns
    -------
    tuple
        The function returns three pandas data frames with the extracted information about proteins, peptides, and summary information about the whole experiment, followed by the name of the main DIANN output file.

    Raises
    ------
    ValueError
        If the folder contains fewer than two .tsv files.
    """
    # the extension is passed as a list to get an exact match per extension
    tsv_files = sorted(
        get_filenames_from_directory(path_diann_output_folder, ['tsv']),
        key=len
    )
    if len(tsv_files) < 2:
        raise ValueError(
            f"Two .tsv files (the main DIANN output and the stats file) are expected in {path_diann_output_folder}, but {len(tsv_files)} found."
        )
    diann_output_file, diann_stats_file = tsv_files[:2]

    diann_df = pd.read_csv(os.path.join(path_diann_output_folder, diann_output_file), sep='\t', low_memory=False)
    diann_df = diann_df[diann_df.Run == experiment]

    diann_proteins = create_diann_proteins_table(diann_df, fasta)
    diann_peptides = create_diann_peptides_table(diann_df)

    diann_overview = import_diann_stats(os.path.join(path_diann_output_folder, diann_stats_file), experiment)

    return diann_proteins, diann_peptides, diann_overview, diann_output_file


def create_ap_proteins_table(
    ap_df: pd.DataFrame,
    fasta: object
):
    """Build the protein level table from the loaded AlphaPept peptide results.

    Parameters
    ----------
    ap_df : pd.DataFrame
        The data frame of the AlphaPept results file. The columns 'Protein names', 'Protein IDs', 'Gene names' and 'Sequence lengths' are added to this data frame in place.
    fasta : object
        The object containing information about all proteins from the fasta file. It is passed on to alphaviz.preprocessing.get_protein_info().

    Returns
    -------
    pd.DataFrame
        One row per 'index_protein_group' value. The table starts with the columns 'Protein IDs', 'Protein names', 'Gene names', '# proteins', '(EXP) # peptides', '# MS/MS' and 'Sequence lengths', followed by all other aggregated columns in alphabetical order.
    """
    def _info_from_header(row):
        return alphaviz.preprocessing.get_protein_info_from_fastaheader(
            row['protein_group']
        )

    def _info_from_fasta(row):
        return alphaviz.preprocessing.get_protein_info(
            fasta, row['Protein IDs']
        )

    def _count_entries(joined):
        return len(joined.split(','))

    ap_df[['Protein names', 'Protein IDs', 'Gene names']] = ap_df.apply(
        _info_from_header, axis=1, result_type='expand'
    )
    # 'Protein names' is overwritten with the values of the fasta lookup
    ap_df[['Protein names', 'Sequence lengths']] = ap_df.apply(
        _info_from_fasta, axis=1, result_type='expand'
    )

    protein_columns = [name for name in ap_df.columns if 'protein' in name]
    columns = protein_columns + [
        'sequence', 'Protein names', 'Protein IDs',
        'Gene names', 'Sequence lengths'
    ]

    # the maximum is taken for every column, only the sequences are counted
    agg_dict = {name: 'max' for name in columns}
    agg_dict['sequence'] = 'count'
    grouped_ap_df = ap_df.groupby(
        'index_protein_group',
        as_index=False
    )[columns].agg(agg_dict)
    grouped_ap_df = grouped_ap_df.rename(
        columns={'sequence': '(EXP) # peptides'}
    )

    grouped_ap_df['# proteins'] = grouped_ap_df['protein_idx'].apply(
        _count_entries
    )
    # the number of counted sequences is used for both columns
    grouped_ap_df['# MS/MS'] = grouped_ap_df['(EXP) # peptides']

    first_columns = [
        'Protein IDs', 'Protein names', 'Gene names', '# proteins',
        '(EXP) # peptides', '# MS/MS', 'Sequence lengths'
    ]
    other_columns = sorted(
        set(grouped_ap_df.columns).difference(first_columns)
    )
    return grouped_ap_df[first_columns + other_columns]


def create_ap_peptides_table(
    ap_df: pd.DataFrame
):
    """Build the peptide level table from the loaded AlphaPept peptide results.

    Parameters
    ----------
    ap_df : pd.DataFrame
        The data frame of the AlphaPept results file. It is not modified.

    Returns
    -------
    pd.DataFrame
        The columns of ap_df without 'protein' or 'Protein' in their names and without 'Sequence lengths'. Several columns are renamed ('n_AA' to 'Length', 'charge' to 'Charge', 'sequence_naked' to 'Sequence', 'parent' to 'MS/MS scan number', 'sequence' to 'Sequence_AP_mod', 'mz' to 'm/z', 'mass' to 'Mass', 'mobility' to 'IM', 'rt' to 'RT') and 'Modified.Sequence' is added as a copy of 'Sequence_AP_mod'. The table starts with the columns 'Modified.Sequence', 'Length', 'm/z', 'RT', 'Charge', 'Mass' and 'IM', followed by all other columns in alphabetical order.
    """
    def _is_peptide_column(name):
        if 'protein' in name or 'Protein' in name:
            return False
        return name != 'Sequence lengths'

    new_names = {
        'n_AA': 'Length',
        'charge': 'Charge',
        'sequence_naked': 'Sequence',
        'parent': 'MS/MS scan number',
        'sequence': 'Sequence_AP_mod',
        'mz': 'm/z',
        'mass': 'Mass',
        'mobility': 'IM',
        'rt': 'RT',
    }
    kept_columns = [name for name in ap_df.columns if _is_peptide_column(name)]

    peptides = ap_df[kept_columns].copy().rename(columns=new_names)
    peptides['Modified.Sequence'] = peptides['Sequence_AP_mod']

    first_columns = [
        'Modified.Sequence', 'Length', 'm/z',
        'RT', 'Charge', 'Mass', 'IM'
    ]
    other_columns = sorted(set(peptides.columns).difference(first_columns))
    return peptides[first_columns + other_columns]


def import_alphapept_output(
    path_ap_output_folder: str,
    experiment: str,
    fasta: object
):
    """Load the 'results_peptides.csv' file from the AlphaPept output folder and return the data frames containing information about proteins and peptides.

    Parameters
    ----------
    path_ap_output_folder : str
        Path to the AlphaPept output folder containing the 'results_peptides.csv' file.
    experiment : str
        The name of the experiment. Only rows whose 'shortname' value equals this name are used.
    fasta : object
        The object containing information about all proteins from the fasta file.

    Returns
    -------
    tuple of pd.DataFrames
        The function returns two pandas data frames with the extracted information about proteins and peptides.
    """
    ap_output_file = 'results_peptides.csv'
    ap_df = pd.read_csv(
        os.path.join(path_ap_output_folder, ap_output_file),
        low_memory=False
    )
    # copy the filtered rows, because the data frame is modified below
    # (columns are dropped here and added by create_ap_proteins_table)
    ap_df = ap_df[ap_df.shortname == experiment].copy()
    cols_to_remove = ['filename', 'shortname', 'sample_group']
    ap_df.drop(columns=cols_to_remove, inplace=True)
    # the proteins table has to be created first, as it adds the protein
    # annotation columns to ap_df which is reused for the peptides table
    ap_proteins = create_ap_proteins_table(ap_df, fasta)
    ap_peptides = create_ap_peptides_table(ap_df)

    return ap_proteins, ap_peptides