Source code for autoprot.preprocessing.data_handling

# -*- coding: utf-8 -*-
"""
Autoprot Preprocessing Functions.

@author: Wignand, Julian, Johannes

@documentation: Julian
"""

import pandas as pd
import os
import requests
from urllib import parse
from ftplib import FTP


def read_csv(file, sep='\t', low_memory=False, **kwargs):
    r"""
    Load a delimited text file into a dataframe (tab-separated by default).

    Thin convenience wrapper around ``pd.read_csv`` that only changes the
    default values of ``sep`` and ``low_memory``.

    Parameters
    ----------
    file : str
        Path to input file.
    sep : str, optional
        Column separator. The default is '\t'.
    low_memory : bool, optional
        Whether to reduce memory consumption by inferring dtypes from chunks.
        The default is False.
    **kwargs :
        Forwarded unchanged to ``pd.read_csv``, see
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.

    Returns
    -------
    pd.DataFrame
        The parsed dataframe.
    """
    # Collect the modified defaults first, then layer the caller's extras on top.
    reader_options = {'sep': sep, 'low_memory': low_memory}
    reader_options.update(kwargs)
    parsed = pd.read_csv(file, **reader_options)
    return parsed
def to_csv(df, file, sep='\t', index=False, **kwargs):
    r"""
    Save a dataframe as a delimited text file (tab-separated, no index by default).

    Thin convenience wrapper around ``pd.DataFrame.to_csv`` that only changes
    the default values of ``sep`` and ``index``.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to write.
    file : str
        Path to output file.
    sep : str, optional
        Column separator. The default is '\t'.
    index : bool, optional
        Whether to add the dataframe index to the output.
        The default is False.
    **kwargs :
        Forwarded unchanged to ``pd.DataFrame.to_csv``, see
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html.

    Returns
    -------
    None.
    """
    # Collect the modified defaults first, then layer the caller's extras on top.
    writer_options = {'sep': sep, 'index': index}
    writer_options.update(kwargs)
    # The result of DataFrame.to_csv is intentionally discarded.
    df.to_csv(file, **writer_options)
def download_from_ftp(url, save_dir, login_name='anonymous', login_pw=''):
    r"""
    Download a single file from an FTP server into a local directory.

    Parameters
    ----------
    url : str
        FTP URL of the file to download. The host is taken from the network
        location of the URL; the remote directory and the file name are taken
        from its path.
    save_dir : str
        Local directory the file is written to. It is not created by this
        function. The local file keeps the name of the remote file.
    login_name : str
        Login name for the FTP server.
        Default is 'anonymous' working for the PRIDE FTP server.
    login_pw : str
        Password for access to the FTP server.
        Default is ''

    Returns
    -------
    str
        Path to the downloaded file.

    Examples
    --------
    Download all files from a dictionary holding file names and ftp links and
    save the paths to the downloaded files in a list.

    >>> downloadedFiles = []
    >>> for file in ftpDict.keys():
    ...     downloadedFiles.append(pp.download_from_ftp(ftpDict[file], r'C:\Users\jbender\Documents\python_playground'))
    """
    # Parse the URL once and reuse the pieces.
    parsed_url = parse.urlparse(url)
    remote_dir, file = os.path.split(parsed_url.path)
    local_path = os.path.join(save_dir, file)

    # Context managers guarantee that both the FTP connection and the local
    # file handle are released, even if login, cwd or the transfer fails.
    with FTP(parsed_url.netloc) as ftp:
        ftp.login(login_name, login_pw)
        ftp.cwd(remote_dir)
        with open(local_path, 'wb') as local_file:
            ftp.retrbinary("RETR " + file, local_file.write)

    print(f'Downloaded {file}')
    return local_path
def fetch_from_pride(accession, term, ignore_caps=True, timeout=60):
    """
    Get FTP download links of files belonging to a PRIDE identifier.

    Queries the PRIDE archive web service for all files of a project and keeps
    those whose file name contains ``term``.

    Parameters
    ----------
    accession : str
        PRIDE identifier.
    term : str
        Part of the filename belonging to the project.
        For example 'proteingroups'
    ignore_caps : bool, optional
        Whether to ignore capitalisation during matching of terms.
        The default is True.
    timeout : float, optional
        Seconds to wait for the PRIDE web service before giving up.
        The default is 60.

    Returns
    -------
    file_locs : dict
        Dict mapping filenames to FTP download links.
        If ``ignore_caps`` is true, the keys are the lower-cased file names.

    Raises
    ------
    requests.HTTPError
        If the PRIDE web service answers with an error status code.

    Examples
    --------
    Generate a dict mapping file names to ftp download links.
    Note that only files containing the string proteingroups are retrieved.

    >>> ftpDict = pp.fetch_from_pride("PXD031829", 'proteingroups')
    """
    # Let requests build the query string so the accession is URL-encoded,
    # and never wait forever on an unresponsive server.
    response = requests.get(
        'https://www.ebi.ac.uk/pride/ws/archive/v2/files/byProject',
        params={'accession': accession},
        headers={'Accept': 'application/json'},
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of iterating over an error payload.
    response.raise_for_status()
    js_list = response.json()

    # The search term does not change per file, so normalise it only once.
    if ignore_caps:
        term = term.lower()

    file_locs = {}
    for fdict in js_list:
        fname = fdict['fileName']
        if ignore_caps:
            fname = fname.lower()
        if term not in fname:
            continue
        # Only the FTP location of a matching file is recorded.
        for protocol in fdict['publicFileLocations']:
            if protocol['name'] == 'FTP Protocol':
                file_locs[fname] = protocol['value']
                print(f'Found file {fname}')
    return file_locs