# Source code for autoprot.preprocessing.data_handling

# -*- coding: utf-8 -*-
"""
Autoprot Preprocessing Functions.

@author: Wignand, Julian, Johannes

@documentation: Julian
"""

import pandas as pd
import os
import requests
from urllib import parse
from ftplib import FTP



# [docs]
def read_csv(file, sep="\t", low_memory=False, **kwargs):
    r"""
    Read a delimited text file with tab-separated defaults.

    Thin convenience wrapper around ``pd.read_csv`` that only changes the
    default values of ``sep`` and ``low_memory``.

    Parameters
    ----------
    file : str
        Path to input file (anything ``pd.read_csv`` accepts is passed through).
    sep : str, optional
        Column separator. The default is '\t'.
    low_memory : bool, optional
        Forwarded to ``pd.read_csv``. The default is False.
    **kwargs :
        Any further keyword argument of ``pd.read_csv``, see
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.

    Returns
    -------
    pd.DataFrame
        The parsed dataframe.

    """
    # Collect the wrapper's own defaults first, then layer the caller's
    # additional options on top (they can never collide with the named ones).
    reader_options = {"sep": sep, "low_memory": low_memory}
    reader_options.update(kwargs)
    parsed = pd.read_csv(file, **reader_options)
    return parsed




# [docs]
def to_csv(df, file, sep="\t", index=False, **kwargs):
    r"""
    Write a dataframe to a delimited text file with tab-separated defaults.

    Thin convenience wrapper around ``DataFrame.to_csv`` that only changes
    the default values of ``sep`` and ``index``.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to write.
    file : str
        Path to output file (passed through to ``df.to_csv`` unchanged).
    sep : str, optional
        Column separator. The default is '\t'.
    index : bool, optional
        Whether to add the dataframe index to the output. The default is False.
    **kwargs :
        Any further keyword argument of ``DataFrame.to_csv``, see
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.

    Returns
    -------
    None.
        The return value of ``df.to_csv`` is intentionally discarded.

    """
    writer_options = {"sep": sep, "index": index}
    writer_options.update(kwargs)
    df.to_csv(file, **writer_options)
    return None




# [docs]
def download_from_ftp(url, save_dir, login_name="anonymous", login_pw=""):
    r"""
    Download a file from FTP.

    The host is taken from the network location of ``url``; the URL path is
    split into the remote directory (which is entered with ``CWD``) and the
    file name (which is fetched with ``RETR``).

    Parameters
    ----------
    url : str
        FTP URL of the file to download,
        e.g. ``ftp://host/some/dir/file.txt``.
    save_dir : str
        Local directory the file is written to. The file keeps its remote
        name. The directory is not created by this function.
    login_name : str
        Login name for the FTP server.
        Default is 'anonymous' working for the PRIDE FTP server.
    login_pw : str
        Password for access to the FTP server.
        Default is ''

    Returns
    -------
    str
        Path to the downloaded file.

    Examples
    --------
    Download all files from a dictionary holding file names and ftp links and
    save the paths to the downloaded files in a list.

    >>> downloadedFiles = []
    >>> for file in ftpDict.keys():
    ...     downloadedFiles.append(pp.download_from_ftp(ftpDict[file], r'C:\Users\jbender\Documents\python_playground'))

    """
    # Parse the URL once and reuse the pieces.
    parsed_url = parse.urlparse(url)
    path, file = os.path.split(parsed_url.path)
    target = os.path.join(save_dir, file)

    # The context managers guarantee that both the FTP connection and the
    # local file handle are closed (and the file flushed) even if the
    # transfer fails part-way.
    with FTP(parsed_url.netloc) as ftp:
        ftp.login(login_name, login_pw)
        ftp.cwd(path)
        with open(target, "wb") as local_file:
            ftp.retrbinary("RETR " + file, local_file.write)
        print(f"Downloaded {file}")
    return target




# [docs]
def fetch_from_pride(accession, term, ignore_caps=True):
    """
    Get FTP download links of files belonging to a PRIDE identifier.

    Queries the PRIDE archive web service for all files of a project and
    keeps those whose file name contains ``term``.

    Parameters
    ----------
    accession : str
        PRIDE identifier.
    term : str
        Part of the filename belonging to the project.
        For example 'proteingroups'
    ignore_caps : bool, optional
        Whether to ignore capitalisation during matching of terms.
        The default is True.

    Returns
    -------
    file_locs : dict
        Dict mapping filenames to FTP download links.
        If ``ignore_caps`` is set, the file names used as keys are
        lower-cased.

    Raises
    ------
    requests.HTTPError
        If the PRIDE web service answers with an HTTP error status.

    Examples
    --------
    Generate a dict mapping file names to ftp download links.
    Note that only files containing the string proteingroups are retrieved.

    >>> ftpDict = pp.fetch_from_pride("PXD031829", 'proteingroups')

    """
    # Pass the accession via ``params`` so that requests takes care of
    # URL-encoding it instead of interpolating it into the query string.
    response = requests.get(
        "https://www.ebi.ac.uk/pride/ws/archive/v2/files/byProject",
        params={"accession": accession},
        headers={"Accept": "application/json"},
    )
    # Fail early on HTTP errors instead of trying to interpret an error
    # payload as the list of project files.
    response.raise_for_status()
    js_list = response.json()

    # The search term does not change between files: normalise it once.
    if ignore_caps:
        term = term.lower()

    file_locs = {}

    for fdict in js_list:
        fname = fdict["fileName"]
        if ignore_caps:
            fname = fname.lower()
        if term not in fname:
            continue
        # Only the FTP location is of interest; other protocols are skipped.
        for protocol in fdict["publicFileLocations"]:
            if protocol["name"] == "FTP Protocol":
                file_locs[fname] = protocol["value"]
                print(f"Found file {fname}")
    return file_locs