diff --git a/config/config_parser.py b/config/config_parser.py
index 763476b..43916d6 100644
--- a/config/config_parser.py
+++ b/config/config_parser.py
@@ -1,8 +1,17 @@
+"""
+config_parser.py
+"""
 from configparser import ConfigParser
 import os
 
 
 def parse_config(filename, section='postgresql'):
+    """
+    Parse the database.ini file.
+    :param filename: path to the .ini config file
+    :param section: section of the config file to read (default 'postgresql')
+    :return: dict of parameters from the given section
+    """
     # create a parser
     parser = ConfigParser()
     # read config file
diff --git a/config/database.py b/config/database.py
index 4ab0f2f..3cc2fe4 100644
--- a/config/database.py
+++ b/config/database.py
@@ -1,3 +1,6 @@
+"""
+Sessions used by SQLAlchemy.
+"""
 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker
 from config_parser import get_conn_str
diff --git a/config/index.py b/config/index.py
index b9929b7..44db643 100644
--- a/config/index.py
+++ b/config/index.py
@@ -1,8 +1,14 @@
+"""
+index.py
+This file contains a helper function to get a database session.
+"""
 from config.database import SessionLocal
 
 
-# Helper function to get database session
 def get_session():
+    """
+    Helper function to get a database session.
+    """
     session = SessionLocal()
     try:
         yield session
diff --git a/models/analysiscollectionspectrumidentification.py b/models/analysiscollectionspectrumidentification.py
index 64c4f6b..cb89240 100644
--- a/models/analysiscollectionspectrumidentification.py
+++ b/models/analysiscollectionspectrumidentification.py
@@ -1,3 +1,7 @@
+"""
+This file contains the AnalysisCollectionSpectrumIdentification class,
+which is a SQLAlchemy model for the analysiscollectionspectrumidentification table in the database.
+"""
 from sqlalchemy.orm import Mapped, mapped_column
 from sqlalchemy import ForeignKey, Text, ForeignKeyConstraint, Integer, Any, JSON
 from models.base import Base
diff --git a/models/base.py b/models/base.py
index fa2b68a..e0fd8ca 100644
--- a/models/base.py
+++ b/models/base.py
@@ -1,3 +1,6 @@
+"""
+Base class for all models.
+"""
 from sqlalchemy.orm import DeclarativeBase
 
 
diff --git a/models/dbsequence.py b/models/dbsequence.py
index ef26be3..6270e45 100644
--- a/models/dbsequence.py
+++ b/models/dbsequence.py
@@ -1,3 +1,4 @@
+"""This file contains the DBSequence class, which is a SQLAlchemy model for the dbsequence table in the database."""
 from sqlalchemy.orm import Mapped, mapped_column
 from sqlalchemy import ForeignKey, Text, Integer
 from models.base import Base
diff --git a/models/enzyme.py b/models/enzyme.py
index 2fa0f2e..deb8951 100644
--- a/models/enzyme.py
+++ b/models/enzyme.py
@@ -1,3 +1,4 @@
+"""This file contains the Enzyme class, which is a SQLAlchemy model for the enzyme table in the database."""
 from sqlalchemy.orm import Mapped, mapped_column
 from sqlalchemy import ForeignKey, Text, BOOLEAN, ForeignKeyConstraint, Integer
 from models.base import Base
diff --git a/models/match.py b/models/match.py
index 640e33c..514332e 100644
--- a/models/match.py
+++ b/models/match.py
@@ -1,3 +1,4 @@
+"""This file contains the Match class, which is a SQLAlchemy model for the match table in the database."""
 from sqlalchemy.orm import Mapped, mapped_column
 from sqlalchemy import ForeignKey, Text, FLOAT, JSON, BOOLEAN, Integer, ForeignKeyConstraint, CHAR, Index
 from models.base import Base
diff --git a/models/modifiedpeptide.py b/models/modifiedpeptide.py
index a6aa9a6..ce488e6 100644
--- a/models/modifiedpeptide.py
+++ b/models/modifiedpeptide.py
@@ -1,3 +1,7 @@
+"""
+This file contains the ModifiedPeptide class,
+which is a SQLAlchemy model for the modifiedpeptide table in the database.
+""" from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy import ForeignKey, Text, Integer, JSON, FLOAT, Index from models.base import Base diff --git a/models/peptideevidence.py b/models/peptideevidence.py index 10289e7..ca0af9c 100644 --- a/models/peptideevidence.py +++ b/models/peptideevidence.py @@ -1,3 +1,7 @@ +""" +This file contains the PeptideEvidence class, +which is a SQLAlchemy model for the peptideevidence table in the database. +""" from sqlalchemy.orm import Mapped, mapped_column from sqlalchemy import ForeignKey, Text, Integer, BOOLEAN, ForeignKeyConstraint, Index from models.base import Base diff --git a/models/spectradata.py b/models/spectradata.py index b3b8e40..6c1b188 100644 --- a/models/spectradata.py +++ b/models/spectradata.py @@ -1,5 +1,5 @@ from sqlalchemy.orm import Mapped, mapped_column -from sqlalchemy import ForeignKey, Text, Integer, UniqueConstraint +from sqlalchemy import ForeignKey, Text, Integer from models.base import Base diff --git a/parser/APIWriter.py b/parser/APIWriter.py index 045765c..fa77f68 100644 --- a/parser/APIWriter.py +++ b/parser/APIWriter.py @@ -1,3 +1,4 @@ +"""APIWriter.py - Class for writing results via an API.""" import traceback import requests import json @@ -117,6 +118,13 @@ def write_mzid_info(self, analysis_software_list, spectra_formats, return None def write_other_info(self, contains_crosslinks, upload_warnings, upload_id): + """ + Update Upload row with remaining info. + :param contains_crosslinks: + :param upload_warnings: + :param upload_id: + :return: + """ response = None try: # todo: use urljoin diff --git a/parser/DatabaseWriter.py b/parser/DatabaseWriter.py index 39953e4..242eefb 100644 --- a/parser/DatabaseWriter.py +++ b/parser/DatabaseWriter.py @@ -1,3 +1,4 @@ +"""DatabaseWriter class for writing results to a postgresql relational database.""" from sqlalchemy import create_engine, MetaData from sqlalchemy import Table @@ -42,6 +43,12 @@ def write_data(self, table, data): conn.close() def write_new_upload(self, table, data): + """ + Insert data into upload table and return the id of the new row. + :param table: + :param data: + :return: + """ table = Table(table, self.meta, autoload_with=self.engine, quote=False) with self.engine.connect() as conn: statement = table.insert().values(data).returning(table.columns[0]) # RETURNING id AS upload_id @@ -54,8 +61,6 @@ def write_mzid_info(self, analysis_software_list, spectra_formats, provider, audits, samples, bib, upload_id): """ Update Upload row with mzid info. - - ToDo: have this explicitly or create update func? :param analysis_software_list: (list) List of analysis software used. 
         :param spectra_formats:
         :param provider:
@@ -66,6 +71,7 @@ def write_mzid_info(self, analysis_software_list, spectra_formats,
         :return:
         """
         upload = Table("upload", self.meta, autoload_with=self.engine, quote=False)
+        # noinspection PyTypeChecker
         stmt = upload.update().where(upload.c.id == str(upload_id)).values(
             analysis_software_list=analysis_software_list,
             spectra_formats=spectra_formats,
@@ -90,6 +96,7 @@ def write_other_info(self, contains_crosslinks, upload_warnings, upload_id):
         """
         upload = Table("upload", self.meta, autoload_with=self.engine, quote=False)
         with self.engine.connect() as conn:
+            # noinspection PyTypeChecker
             stmt = upload.update().where(upload.c.id == str(upload_id)).values(
                 contains_crosslinks=contains_crosslinks,
                 upload_warnings=upload_warnings,
diff --git a/parser/MzIdParser.py b/parser/MzIdParser.py
index d542dc1..d82fb46 100644
--- a/parser/MzIdParser.py
+++ b/parser/MzIdParser.py
@@ -1,3 +1,6 @@
+"""
+Converts mzIdentML files to DB entries.
+"""
 import base64
 import gzip
 import json
@@ -22,6 +25,7 @@
 
 
 class MzIdParseException(Exception):
+    """Exception raised on errors when parsing mzIdentML files."""
     pass
 
 
@@ -102,6 +106,10 @@ def parse(self):
 
     @staticmethod
     def check_spectra_data_validity(sp_datum):
+        """
+        Check if the SpectraData element is valid.
+        :param sp_datum:
+        """
         # is there anything we'd like to complain about?
         # SpectrumIDFormat
         if 'SpectrumIDFormat' not in sp_datum or sp_datum['SpectrumIDFormat'] is None:
@@ -664,7 +672,7 @@ def main_loop(self):
         main_loop_start_time = time()
         self.logger.info('main loop - start')
 
-        msi_regex = re.compile(r'^([0-9]+)(?::(P|C))$')
+        msi_regex = re.compile(r'^([0-9]+):([PC])$')
 
         spec_count = 0
         spectra = []
@@ -814,6 +822,7 @@ def main_loop(self):
 
     # noinspection PyBroadException
     def upload_info(self):
+        """Write mzid file-level info to the DB."""
         upload_info_start_time = time()
         self.logger.info('parse upload info - start')
         self.mzid_reader.reset()
@@ -864,6 +873,9 @@ def upload_info(self):
                 round(time() - upload_info_start_time, 2)))
 
     def fill_in_missing_scores(self):
+        """
+        Legacy xiSPEC, ignore.
+        """
         pass
 
     def write_new_upload(self):
@@ -941,6 +953,11 @@ def get_cv_params(self, element, super_cls_accession=None):
     # split into two functions
     @staticmethod
    def extract_mzid(archive):
+        """
+        Extract the files from the archive.
+        :param archive:
+        :return:
+        """
         if archive.endswith('zip'):
             zip_ref = zipfile.ZipFile(archive, 'r')
             unzip_path = archive + '_unzip/'
@@ -1043,7 +1060,7 @@ def write_new_upload(self):
             }
 
             table = 'upload'
-            response = self.writer.write_data(table, upload_data)
+            self.writer.write_data(table, upload_data)
 
         except SQLAlchemyError as e:
             print(f"Error during database insert: {e}")
diff --git a/parser/NumpyEncoder.py b/parser/NumpyEncoder.py
deleted file mode 100644
index cb69815..0000000
--- a/parser/NumpyEncoder.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import json
-
-
-class NumpyEncoder(json.JSONEncoder):
-    # def default(self, obj):
-    #     if isinstance(obj, np.ndarray):
-    #         return obj.tolist()
-    #     return json.JSONEncoder.default(self, obj)
-    def default(self, o):
-        try:
-            iterable = iter(o)
-        except TypeError:
-            pass
-        else:
-            return list(iterable)
-        # Let the base class default method raise the TypeError
-        return json.JSONEncoder.default(self, o)
diff --git a/parser/SimpleFASTA.py b/parser/SimpleFASTA.py
index e7083e1..3378c43 100644
--- a/parser/SimpleFASTA.py
+++ b/parser/SimpleFASTA.py
@@ -1,3 +1,4 @@
+"""SimpleFASTA.py - Parse FASTA files and return a dictionary of the sequences."""
 import re
 
 
@@ -5,6 +6,11 @@
 
 # noinspection PyUnusedLocal
 def get_db_sequence_dict(fasta_file_list):
+    """
+    Parse FASTA files and return a dictionary of the sequences.
+    :param fasta_file_list: list of FASTA file paths
+    :return: dict
+    """
     db_sequence_dict = {}
     identifier = None
     sequence = ""
@@ -39,6 +45,14 @@
 
 
 def add_entry(identifier, sequence, description, seq_dict):
+    """
+    Add an entry to the sequence dictionary.
+    :param identifier:
+    :param sequence:
+    :param description:
+    :param seq_dict:
+    :return: None
+    """
     m = re.search(r'..\|(.*)\|(.*)\s?', identifier)
     # id = identifier
     accession = identifier
diff --git a/parser/Writer.py b/parser/Writer.py
index 0001720..71ea36e 100644
--- a/parser/Writer.py
+++ b/parser/Writer.py
@@ -1,26 +1,54 @@
+"""Writer.py - Abstract class for writing results to a database."""
 from abc import ABC, abstractmethod
 
 
 # Strategy interface
 class Writer(ABC):
-
+    """
+    Interface for writing results to a database.
+    """
     def __init__(self, upload_id=None, pxid=None):
         self.pxid = pxid
         self.upload_id = upload_id
 
     @abstractmethod
     def write_data(self, table, data):
+        """
+        Insert data into the given table.
+        :param table:
+        :param data:
+        """
         pass
 
     @abstractmethod
     def write_new_upload(self, table, data):
+        """
+        Insert data into the upload table and, if Postgres, return the id of the new row.
+        :param table:
+        :param data:
+        """
         pass
 
     @abstractmethod
     def write_mzid_info(self, analysis_software_list, spectra_formats,
                         provider, audits, samples, bib, upload_id):
+        """
+        Update the Upload row with the mzid info.
+        :param analysis_software_list:
+        :param spectra_formats:
+        :param provider:
+        :param audits:
+        :param samples:
+        :param bib:
+        :param upload_id:
+        """
         pass
 
     @abstractmethod
     def fill_in_missing_scores(self):
+        """
+        Legacy xiSPEC thing, can be ignored;
+        left in rather than creating a backwards-compatibility issue for xiSPEC.
+        todo - probably remove
+        """
         pass
diff --git a/parser/__init__.py b/parser/__init__.py
index 52b7fef..f739656 100644
--- a/parser/__init__.py
+++ b/parser/__init__.py
@@ -1,3 +1,4 @@
+"""parser module"""
 from .csv_parser.FullCsvParser import *
 from .csv_parser.LinksOnlyCsvParser import *
 from .csv_parser.NoPeakListsCsvParser import *
diff --git a/parser/csv_parser/AbstractCsvParser.py b/parser/csv_parser/AbstractCsvParser.py
index 62bd55b..cc87729 100644
--- a/parser/csv_parser/AbstractCsvParser.py
+++ b/parser/csv_parser/AbstractCsvParser.py
@@ -1,3 +1,4 @@
+"""Abstract class for csv parsers."""
 import abc
 import os
 from time import time
@@ -18,22 +19,34 @@ class CsvParseException(Exception):
 
 
 class MissingFileException(Exception):
+    """
+    Exception raised for missing files.
+    todo - reuse other exception?
+    """
     pass
 
 
 class AbstractCsvParser(abc.ABC):
     """
-
+    Abstract class for csv parsers.
     """
     @property
     @abc.abstractmethod
     def required_cols(self):
+        """
+        Get required column names in the csv file.
+        :return: list of strings
+        """
         pass
 
     @property
     @abc.abstractmethod
     def optional_cols(self):
+        """
+        Get optional column names in the csv file.
+        :return: list of strings
+        """
         pass
 
     default_values = {
@@ -129,12 +142,22 @@ def __init__(self, csv_path, temp_dir, peak_list_dir, writer, logger):
         # self.csv_reader.fillna('Null', inplace=True)
 
     def check_required_columns(self):
+        """
+        Check if all required columns are present in the csv file.
+        todo - return type / raising exception is not consistent
+        :return: bool
+        :raises CsvParseException: if a required column is missing
+        """
         for required_col in self.required_cols:
             if required_col not in self.csv_reader.columns:
                 raise CsvParseException("Required csv column %s missing" % required_col)
         return True
 
     def get_missing_required_columns(self):
+        """
+        Get missing required columns in the csv file.
+        :return: list of strings
+        """
         missing_cols = []
         for required_col in self.required_cols:
             if required_col not in self.csv_reader.columns:
@@ -149,6 +172,9 @@ def get_peak_list_file_names(self):
         return self.csv_reader.peaklistfilename.unique()
 
     def get_sequence_db_file_names(self):
+        """
+        :return: list of all used sequence db file names
+        """
         fasta_files = []
         for file in os.listdir(self.temp_dir):
             if file.endswith(".fasta") or file.endswith(".FASTA"):
@@ -202,7 +228,9 @@ def set_peak_list_readers(self):
         self.peak_list_readers = peak_list_readers
 
     def parse(self):
-
+        """
+        Parse the csv file.
+        """
         start_time = time()
 
         # ToDo: more gracefully handle missing files
@@ -224,6 +252,9 @@ def parse(self):
 
     @abc.abstractmethod
     def main_loop(self):
+        """
+        Main loop for parsing the csv.
+        """
         pass
 
     # @staticmethod
@@ -243,12 +274,18 @@ def main_loop(self):
     #     return masses
 
     def parse_db_sequences(self):
+        """
+        Parse db sequences.
+        """
         self.logger.info('reading fasta - start')
         self.start_time = time()
         self.fasta = SimpleFASTA.get_db_sequence_dict(self.get_sequence_db_file_names())
         self.logger.info('reading fasta - done. Time: ' + str(round(time() - self.start_time, 2)) + " sec")
 
     def upload_info(self):
+        """
+        Write new upload to database.
+ """ self.logger.info('new csv upload') # # ident_file_size = os.path.getsize(self.csv_path) # # peak_list_file_names = json.dumps(self.get_peak_list_file_names(), cls=NumpyEncoder) @@ -260,7 +297,9 @@ def upload_info(self): # self.writer.write_mzid_info(spectra_formats, provider, audits, samples, bib_refs) def write_new_upload(self): - """Write new upload.""" + """Write new upload todatabase. + :raises Exception: if there is an error writing to the database. + """ upload_data = { # 'id': self.writer.upload_id, # 'user_id': self.writer.user_id, diff --git a/parser/csv_parser/FullCsvParser.py b/parser/csv_parser/FullCsvParser.py index 68a1018..ce45483 100644 --- a/parser/csv_parser/FullCsvParser.py +++ b/parser/csv_parser/FullCsvParser.py @@ -1,3 +1,4 @@ +"""""" from .AbstractCsvParser import AbstractCsvParser, CsvParseException from time import time import re diff --git a/parser/database/create_db_schema.py b/parser/database/create_db_schema.py index 76f2509..bd1e044 100644 --- a/parser/database/create_db_schema.py +++ b/parser/database/create_db_schema.py @@ -1,5 +1,7 @@ -import logging.config - +""" +create_db_schema.py +This script creates a database and schema for the application. +""" from sqlalchemy import create_engine from sqlalchemy_utils import database_exists, drop_database, create_database @@ -9,20 +11,34 @@ def create_db(connection_str): + """ + Create a database if it doesn't exist. + :param connection_str: + :return: None + """ engine = create_engine(connection_str) if not database_exists(engine.url): create_database(engine.url) def drop_db(connection_str): + """ + Drop a database if it exists. + :param connection_str: + :return: None + """ engine = create_engine(connection_str) drop_database(engine.url) def create_schema(connection_str): + """ + Create schema for the database. + :param connection_str: + :return: None + """ engine = create_engine(connection_str) # , echo=True) Base.metadata.create_all(engine) - # logging.info(Base.metadata.tables) engine.dispose() diff --git a/parser/database/guid.py b/parser/database/guid.py deleted file mode 100644 index ddcce63..0000000 --- a/parser/database/guid.py +++ /dev/null @@ -1,39 +0,0 @@ -from sqlalchemy.types import TypeDecorator, CHAR -from sqlalchemy.dialects.postgresql import UUID -import uuid - - -class GUID(TypeDecorator): - """Platform-independent GUID type. - - Uses PostgreSQL's UUID type, otherwise uses - CHAR(32), storing as stringified hex values. 
- """ - impl = CHAR - cache_ok = True - - def load_dialect_impl(self, dialect): - if dialect.name == 'postgresql': - return dialect.type_descriptor(UUID()) - else: - return dialect.type_descriptor(CHAR(32)) - - def process_bind_param(self, value, dialect): - if value is None: - return value - elif dialect.name == 'postgresql': - return str(value) - else: - if not isinstance(value, uuid.UUID): - return "%.32x" % uuid.UUID(value).int - else: - # hexstring - return "%.32x" % value.int - - def process_result_value(self, value, dialect): - if value is None: - return value - else: - if not isinstance(value, uuid.UUID): - value = uuid.UUID(value) - return value diff --git a/parser/peaklistReader/PeakListWrapper.py b/parser/peaklistReader/PeakListWrapper.py index c505562..fb46085 100644 --- a/parser/peaklistReader/PeakListWrapper.py +++ b/parser/peaklistReader/PeakListWrapper.py @@ -1,3 +1,6 @@ +""" +PeakListWrapper.py +""" import ntpath import zipfile import re @@ -9,20 +12,26 @@ import io import tarfile - +#todo -check error handling class PeakListParseError(Exception): + """raised if error reading peaklist, invalid spectrum id or spectrum not found in peaklist file.""" pass class SpectrumIdFormatError(Exception): + """raised if the spectrum id format is not supported by the reader.""" pass class ScanNotFoundException(Exception): + """raised if the scan is not found in the mzML file.""" pass class Spectrum: + """ + A class to represent a spectrum. + """ def __init__(self, precursor, mz_array, int_array, rt=np.nan): """ Initialise a Spectrum object. @@ -45,6 +54,9 @@ def __init__(self, precursor, mz_array, int_array, rt=np.nan): class PeakListWrapper: + """ + A class to wrap peak list files and provide an interface to access the spectra. + """ def __init__(self, pl_path, file_format_accession, spectrum_id_format_accession): self.file_format_accession = file_format_accession self.spectrum_id_format_accession = spectrum_id_format_accession @@ -71,16 +83,31 @@ def __getitem__(self, spec_id): return self.reader[spec_id] def is_mgf(self): + """ + Check if the peak list is in MGF format. + :return: bbol + """ return self.file_format_accession == 'MS:1001062' def is_mzml(self): + """ + Check if the peak list is in mzML format. + :return: bool + """ return self.file_format_accession == 'MS:1000584' def is_ms2(self): + """ + Check if the peak list is in MS2 format. + :return: bool + """ return self.file_format_accession == 'MS:1001466' @staticmethod def extract_gz(in_file): + """ + Extract gzipped file. + """ if in_file.endswith('.gz'): in_f = gzip.open(in_file, 'rb') in_file = in_file.replace(".gz", "") @@ -101,7 +128,7 @@ def unzip_peak_lists(zip_file, out_path='.'): :param zip_file: path to archive to unzip :param out_path: where to extract the files - :return: resulting folder + :return: path to resulting folder """ if zip_file.endswith(".zip"): zip_ref = zipfile.ZipFile(zip_file, 'r') diff --git a/parser/peaklistReader/__init__.py b/parser/peaklistReader/__init__.py index e69de29..bd3906c 100644 --- a/parser/peaklistReader/__init__.py +++ b/parser/peaklistReader/__init__.py @@ -0,0 +1 @@ +"""peaklistReader module""" \ No newline at end of file diff --git a/parser/process_dataset.py b/parser/process_dataset.py index de41f1e..8a522ca 100644 --- a/parser/process_dataset.py +++ b/parser/process_dataset.py @@ -108,17 +108,11 @@ def validate(validate_arg, tmpdir): This includes checking that Seq elements are present for target proteins, even though omitting them is technically valid. Prints out results. 
-    Parameters
-    ----------
-    validate_arg : str
+    :param validate_arg: str
         The path to the mzIdentML file or directory to be validated.
-    tmpdir : str
-        The temporary directory to use for validation - an Sqlite DB is created here if given,
-        otherwise an in-memory sqlite DB is used.
-
-    Returns
-    -------
-    None
+    :param tmpdir: str
+        The temporary directory to use for validation - an SQLite DB is created here.
+    :return: None
     """
     if os.path.isdir(validate_arg):
         print(f'Validating directory: {validate_arg}')
@@ -348,7 +342,7 @@ def convert_from_ftp(ftp_url, temp_dir, project_identifier, writer_method, dontd
             ftp = get_ftp_login(ftp_ip)
             try:
                 ftp.cwd(urlparse(ftp_url).path)
-                ftp.retrbinary(f"RETR {f}", open(os.path.join(path, f), 'wb').write)
+                ftp.retrbinary(f"RETR {f}", open(os.path.join(str(path), f), 'wb').write)
                 ftp.quit()
             except ftplib.error_perm as e:
                 ftp.quit()
diff --git a/parser/schema_validate.py b/parser/schema_validate.py
index d0331c0..7bead20 100644
--- a/parser/schema_validate.py
+++ b/parser/schema_validate.py
@@ -1,9 +1,14 @@
+"""schema_validate.py - Validate an mzIdentML file against the 1.2.0 or 1.3.0 schema."""
 import importlib
-import os
 from lxml import etree
 
 
 def schema_validate(xml_file):
+    """
+    Validate an mzIdentML file against the 1.2.0 or 1.3.0 schema.
+    :param xml_file: Path to the mzIdentML file.
+    :return: True if the XML is valid, False otherwise.
+    """
     # Parse the XML file
     with open(xml_file, 'r') as xml:
         xml_doc = etree.parse(xml)
@@ -44,8 +49,8 @@ def schema_validate(xml_file):
     # # read from scehma directory
     # schema_file = os.path.join(current_directory, '..', 'schema', schema_fname)
     # # Parse the XSD file
-    with open(schema_file, 'r') as schema_file:
-        schema_root = etree.XML(schema_file.read())
+    with open(schema_file, 'r') as schema_file_stream:
+        schema_root = etree.XML(schema_file_stream.read())
     schema = etree.XMLSchema(schema_root)
 
     # Validate XML against the schema