From 79dc5c3550b5ce5a9f20f18ddc30e3c620ba5067 Mon Sep 17 00:00:00 2001 From: colin combe Date: Thu, 25 Jul 2024 08:14:45 +0100 Subject: [PATCH 1/2] fix foreign key constraints, assign integer id to SpectraData (i.e. the peaklist info) --- models/__init__.py | 3 +- ...nalysiscollectionspectrumidentification.py | 15 ++- models/enzyme.py | 8 +- models/match.py | 35 +++---- models/peptideevidence.py | 16 ++-- models/searchmodification.py | 8 +- models/spectradata.py | 20 ++++ parser/MzIdParser.py | 96 ++++++++++++------- parser/database/create_db_schema.py | 2 +- tests/test_MzIdParser_ecoli_dsso.py | 10 +- 10 files changed, 137 insertions(+), 76 deletions(-) create mode 100644 models/spectradata.py diff --git a/models/__init__.py b/models/__init__.py index b67fe0d..0a3836a 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -8,13 +8,14 @@ "base", "dbsequence", "enzyme", + "match", "modifiedpeptide", "peptideevidence", "projectdetail", "projectsubdetail", "searchmodification", + "spectradata", "spectrum", - "match", "spectrumidentificationprotocol", "upload" ] diff --git a/models/analysiscollectionspectrumidentification.py b/models/analysiscollectionspectrumidentification.py index 04b10b7..2e608e1 100644 --- a/models/analysiscollectionspectrumidentification.py +++ b/models/analysiscollectionspectrumidentification.py @@ -1,6 +1,7 @@ -from sqlalchemy.orm import Mapped, mapped_column +from sqlalchemy.orm import Mapped, mapped_column, relationship from sqlalchemy import ForeignKey, Text, ForeignKeyConstraint, Integer, Any, JSON from models.base import Base +from models.match import Match class AnalysisCollectionSpectrumIdentification(Base): @@ -14,9 +15,15 @@ class AnalysisCollectionSpectrumIdentification(Base): spectrum_identification_list_ref: Mapped[str] = mapped_column(Text, primary_key=True, nullable=False) spectrum_identification_protocol_ref: Mapped[str] = mapped_column(Text, primary_key=False, nullable=False) spectrum_identification_id: Mapped[str] =
mapped_column(Text, primary_key=False, nullable=False) + # # activity date as time ? + # activity_date: + # # name ? + # name: spectra_data_refs: Mapped[dict[str, Any]] = mapped_column(JSON, primary_key=False, nullable=True) search_database_refs: Mapped[dict[str, Any]] = mapped_column(JSON, primary_key=False, nullable=True) - ForeignKeyConstraint( - ["spectrum_identification_protocol_ref", "upload_id"], - ["spectrumidentificationprotocol.id", "spectrumidentificationprotocol.upload_id"], + __table_args__ = ( + ForeignKeyConstraint( + ["spectrum_identification_protocol_ref", "upload_id"], + ["spectrumidentificationprotocol.id", "spectrumidentificationprotocol.upload_id"], + ), ) diff --git a/models/enzyme.py b/models/enzyme.py index 4440280..8debedf 100644 --- a/models/enzyme.py +++ b/models/enzyme.py @@ -16,7 +16,9 @@ class Enzyme(Base): semi_specific: Mapped[bool] = mapped_column(BOOLEAN, nullable=True) site_regexp: Mapped[str] = mapped_column(Text, nullable=True) accession: Mapped[str] = mapped_column(Text, nullable=True) - ForeignKeyConstraint( - ("protocol_id", "upload_id"), - ("spectrumidentificationprotocol.id", "spectrumidentificationprotocol.upload_id"), + __table_args__ = ( + ForeignKeyConstraint( + ("protocol_id", "upload_id"), + ("spectrumidentificationprotocol.id", "spectrumidentificationprotocol.upload_id"), + ), ) diff --git a/models/match.py b/models/match.py index 73091e8..dbe81f0 100644 --- a/models/match.py +++ b/models/match.py @@ -1,4 +1,4 @@ -from sqlalchemy.orm import Mapped, mapped_column +from sqlalchemy.orm import Mapped, mapped_column, relationship from sqlalchemy import ForeignKey, Text, FLOAT, JSON, BOOLEAN, Integer, ForeignKeyConstraint, CHAR from models.base import Base from typing import Optional, Any @@ -10,7 +10,7 @@ class Match(Base): upload_id: Mapped[int] = mapped_column(Integer, ForeignKey("upload.id"), index=True, primary_key=True, nullable=False) spectrum_id: Mapped[str] = mapped_column(Text, nullable=True) - spectra_data_ref:
Mapped[str] = mapped_column(Text, nullable=True) + spectra_data_id: Mapped[int] = mapped_column(Integer, nullable=True) # nullable for csv data multiple_spectra_identification_id: Mapped[str] = mapped_column(Integer, nullable=True) multiple_spectra_identification_pc: Mapped[str] = mapped_column(CHAR, nullable=True) pep1_id: Mapped[str] = mapped_column(Text, nullable=False) @@ -22,22 +22,25 @@ class Match(Base): exp_mz: Mapped[float] = mapped_column(FLOAT, nullable=True) calc_mz: Mapped[float] = mapped_column(FLOAT, nullable=True) sil_id: Mapped[str] = mapped_column(Text, nullable=True) # null if from csv file - ForeignKeyConstraint( - ["sil_id", "upload_id"], - ["analysiscollectionspectrumidentification.spectrum_identification_list_ref", - "analysiscollectionspectrumidentification.upload_id"], - ), + __table_args__ = ( + ForeignKeyConstraint( + ["sil_id", "upload_id"], + ["analysiscollectionspectrumidentification.spectrum_identification_list_ref", + "analysiscollectionspectrumidentification.upload_id"], + ), + ForeignKeyConstraint( + ["pep1_id", "upload_id"], + ["modifiedpeptide.id", "modifiedpeptide.upload_id"], + ), + ForeignKeyConstraint( + ["pep2_id", "upload_id"], + ["modifiedpeptide.id", "modifiedpeptide.upload_id"], + ) + ) + # Can't use this ForeignKeyConstraint, because we want to allow people to upload data # without spectra # ForeignKeyConstraint( # ["spectrum_id", "spectra_data_ref", "upload_id"], # ["Spectrum.id", "Spectrum.spectra_data_ref", "Spectrum.upload_id"], - # ), - ForeignKeyConstraint( - ["pep1_id", "upload_id"], - ["modifiedpeptide.id", "modifiedpeptide.upload_id"], - ), - ForeignKeyConstraint( - ["pep2_id", "upload_id"], - ["modifiedpeptide.id", "modifiedpeptide.upload_id"], - ) + # ), \ No newline at end of file diff --git a/models/peptideevidence.py b/models/peptideevidence.py index 99c7987..17126b1 100644 --- a/models/peptideevidence.py +++ b/models/peptideevidence.py @@ -11,11 +11,13 @@ class PeptideEvidence(Base): dbsequence_ref: 
Mapped[str] = mapped_column(Text, primary_key=True, nullable=False) pep_start: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) is_decoy: Mapped[bool] = mapped_column(BOOLEAN, nullable=True) - ForeignKeyConstraint( - ("dbsequence_ref", "upload_id"), - ("dbsequence.id", "dbsequence.upload_id"), - ) - ForeignKeyConstraint( - ("peptide_ref", "upload_id"), - ("modifiedpeptide.id", "modifiedpeptide.upload_id"), + __table_args__ = ( + ForeignKeyConstraint( + ("dbsequence_ref", "upload_id"), + ("dbsequence.id", "dbsequence.upload_id"), + ), + ForeignKeyConstraint( + ("peptide_ref", "upload_id"), + ("modifiedpeptide.id", "modifiedpeptide.upload_id"), + ) ) diff --git a/models/searchmodification.py b/models/searchmodification.py index cff93e6..8f82bea 100644 --- a/models/searchmodification.py +++ b/models/searchmodification.py @@ -16,7 +16,9 @@ class SearchModification(Base): fixed_mod: Mapped[bool] = mapped_column(BOOLEAN, nullable=False) accession: Mapped[str] = mapped_column(Text, nullable=True) crosslinker_id: Mapped[str] = mapped_column(Text, nullable=True) - ForeignKeyConstraint( - ("protocol_id", "upload_id"), - ("spectrumidentificationprotocol.id", "spectrumidentificationprotocol.upload_id"), + __table_args__ = ( + ForeignKeyConstraint( + ("protocol_id", "upload_id"), + ("spectrumidentificationprotocol.id", "spectrumidentificationprotocol.upload_id"), + ), ) diff --git a/models/spectradata.py b/models/spectradata.py new file mode 100644 index 0000000..356e304 --- /dev/null +++ b/models/spectradata.py @@ -0,0 +1,20 @@ +from sqlalchemy.orm import Mapped, mapped_column +from sqlalchemy import ForeignKey, Text, Integer, UniqueConstraint +from models.base import Base + + +class SpectraData(Base): + __tablename__ = "spectradata" + id: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + upload_id: Mapped[int] = mapped_column(Integer, ForeignKey("upload.id"), primary_key=True, + index=True, nullable=False) + spectra_data_ref: 
Mapped[str] = mapped_column(Text, nullable=False) + location: Mapped[str] = mapped_column(Text, nullable=False) + name: Mapped[str] = mapped_column(Text, nullable=True) + external_format_documentation: Mapped[str] = mapped_column(Text, nullable=True) + file_format: Mapped[str] = mapped_column(Text, nullable=False) + spectrum_id_format: Mapped[str] = mapped_column(Text, nullable=False) + + __table_args__ = ( + UniqueConstraint("spectra_data_ref", "upload_id"), + ) diff --git a/parser/MzIdParser.py b/parser/MzIdParser.py index 097cbe1..a47d621 100644 --- a/parser/MzIdParser.py +++ b/parser/MzIdParser.py @@ -43,6 +43,7 @@ def __init__(self, mzid_path, temp_dir, peak_list_dir, writer, logger): self.mzid_path = mzid_path self.peak_list_readers = {} # peak list readers indexed by spectraData_ref + self.spectra_data_id_lookup = {} # spectra_data_ref to spectra_data_id lookup self.temp_dir = temp_dir if not self.temp_dir.endswith('/'): self.temp_dir += '/' @@ -81,8 +82,7 @@ def parse(self): start_time = time() self.upload_info() # overridden (empty function) in xiSPEC subclass self.parse_analysis_protocol_collection() - if self.peak_list_dir: - self.init_peak_list_readers() + self.parse_spectradata_and_init_peak_list_readers() self.parse_analysis_collection() self.parse_db_sequences() # overridden (empty function) in xiSPEC subclass self.parse_peptides() @@ -117,7 +117,7 @@ def check_spectra_data_validity(sp_datum): if 'location' not in sp_datum or sp_datum['location'] is None: raise MzIdParseException('SpectraData is missing location') - def init_peak_list_readers(self): + def parse_spectradata_and_init_peak_list_readers(self): """ Sets self.peak_list_readers by looping through SpectraData elements @@ -126,53 +126,76 @@ def init_peak_list_readers(self): value: associated peak_list_reader """ peak_list_readers = {} + spectra_data = [] + spectra_data_id_lookup = {} + sd_int_id = 0 for spectra_data_id in self.mzid_reader._offset_index["SpectraData"].keys(): sp_datum = 
self.mzid_reader.get_by_id(spectra_data_id, tag_id='SpectraData') self.check_spectra_data_validity(sp_datum) - sd_id = sp_datum['id'] peak_list_file_name = ntpath.basename(sp_datum['location']) - peak_list_file_path = self.peak_list_dir + peak_list_file_name + file_format = sp_datum['FileFormat'].accession + spectrum_id_format = sp_datum['SpectrumIDFormat'].accession - # noinspection PyBroadException - try: - peak_list_reader = PeakListWrapper( - peak_list_file_path, - sp_datum['FileFormat'].accession, - sp_datum['SpectrumIDFormat'].accession - ) - # ToDo: gz/zip code parts could do with refactoring - except Exception: - # try gz version + if self.peak_list_dir: + peak_list_file_path = self.peak_list_dir + peak_list_file_name + # noinspection PyBroadException try: peak_list_reader = PeakListWrapper( - PeakListWrapper.extract_gz(peak_list_file_path + '.gz'), - sp_datum['FileFormat'].accession, - sp_datum['SpectrumIDFormat'].accession + peak_list_file_path, + file_format, + spectrum_id_format ) - except IOError: - # look for missing peak lists in zip files - for file in os.listdir(self.peak_list_dir): - if file.endswith(".zip"): - zip_file = os.path.join(self.peak_list_dir, file) - try: - with zipfile.ZipFile(zip_file, 'r') as zip_ref: - zip_ref.extractall(self.peak_list_dir) - except IOError: - raise IOError() + # ToDo: gz/zip code parts could do with refactoring + except Exception: + # try gz version try: peak_list_reader = PeakListWrapper( - peak_list_file_path, - sp_datum['FileFormat'].accession, - sp_datum['SpectrumIDFormat'].accession + PeakListWrapper.extract_gz(peak_list_file_path + '.gz'), + file_format, + spectrum_id_format ) - except Exception: - raise MzIdParseException('Missing peak list file: %s' % peak_list_file_path) + except IOError: + # look for missing peak lists in zip files + for file in os.listdir(self.peak_list_dir): + if file.endswith(".zip"): + zip_file = os.path.join(self.peak_list_dir, file) + try: + with zipfile.ZipFile(zip_file, 'r') as 
zip_ref: + zip_ref.extractall(self.peak_list_dir) + except IOError: + raise IOError() + try: + peak_list_reader = PeakListWrapper( + peak_list_file_path, + file_format, + spectrum_id_format + ) + except Exception: + raise MzIdParseException('Missing peak list file: %s' % peak_list_file_path) + + peak_list_readers[spectra_data_id] = peak_list_reader + + spectra_datum = { + 'id': sd_int_id, + 'upload_id': self.writer.upload_id, + 'spectra_data_ref': spectra_data_id, + 'location': sp_datum['location'], + 'name': sp_datum.get('name', None), + 'external_format_documentation': sp_datum.get('externalFormatDocumentation', None), + 'file_format': file_format, + 'spectrum_id_format': spectrum_id_format + } + + spectra_data.append(spectra_datum) + spectra_data_id_lookup[spectra_data_id] = sd_int_id + sd_int_id += 1 - peak_list_readers[sd_id] = peak_list_reader + self.writer.write_data('spectradata', spectra_data) self.peak_list_readers = peak_list_readers + self.spectra_data_id_lookup = spectra_data_id_lookup def parse_analysis_protocol_collection(self): """Parse the AnalysisProtocolCollection and write SpectrumIdentificationProtocols.""" @@ -453,8 +476,6 @@ def parse_peptides(self): peptide_index = 0 peptides = [] for pep_id in self.mzid_reader._offset_index["Peptide"].keys(): - if pep_id == 'peptide_1497_2_p1': - pass peptide = self.mzid_reader.get_by_id(pep_id, tag_id='Peptide') donor_link_site = None acc_link_site = None @@ -705,11 +726,12 @@ def main_loop(self): msi_id = m[1] msi_pc = m[2] + sd_int_id = self.spectra_data_id_lookup[sid_result['spectraData_ref']] ident_data = { 'id': spec_id_item['id'], 'upload_id': self.writer.upload_id, 'spectrum_id': sid_result['spectrumID'], - 'spectra_data_ref': sid_result['spectraData_ref'], + 'spectra_data_id': sd_int_id, 'pep1_id': spec_id_item['peptide_ref'], 'pep2_id': None, 'charge_state': int(spec_id_item['chargeState']), diff --git a/parser/database/create_db_schema.py b/parser/database/create_db_schema.py index 
5851045..ebc6a74 100644 --- a/parser/database/create_db_schema.py +++ b/parser/database/create_db_schema.py @@ -20,7 +20,7 @@ def drop_db(connection_str): def create_schema(connection_str): - engine = create_engine(connection_str) + engine = create_engine(connection_str) # , echo=True) Base.metadata.create_all(engine) logging.info(Base.metadata.tables) engine.dispose() diff --git a/tests/test_MzIdParser_ecoli_dsso.py b/tests/test_MzIdParser_ecoli_dsso.py index fdc24c4..a5903b0 100644 --- a/tests/test_MzIdParser_ecoli_dsso.py +++ b/tests/test_MzIdParser_ecoli_dsso.py @@ -388,8 +388,9 @@ def test_psql_mgf_mzid_parser(tmpdir, use_database, engine): assert results[0].id == 'SII_3_1' # id from first assert results[0].spectrum_id == 'index=3' # spectrumID from # spectraData_ref from - assert results[0].spectra_data_ref == \ - 'SD_0_recal_B190717_13_HF_LS_IN_130_ECLP_DSSO_01_SCX23_hSAX05_rep2.mgf' + # assert results[0].spectra_data_ref == \ + # 'SD_0_recal_B190717_13_HF_LS_IN_130_ECLP_DSSO_01_SCX23_hSAX05_rep2.mgf' + assert results[0].spectra_data_id == 1 # peptide_ref from assert results[0].pep1_id == \ '6_VAEmetETPHLIHKVALDPLTGPMPYQGR_11_MGHAGAIIAGGKGTADEK_11_12_p1' @@ -1152,8 +1153,9 @@ def test_psql_mzml_mzid_parser(tmpdir, use_database, engine): # spectrumID from assert results[0].spectrum_id == 'controllerType=0 controllerNumber=1 scan=14905' # spectraData_ref from - assert results[0].spectra_data_ref == \ - 'SD_0_recal_B190717_13_HF_LS_IN_130_ECLP_DSSO_01_SCX23_hSAX05_rep2.mzML' + # assert results[0].spectra_data_ref == \ + # 'SD_0_recal_B190717_13_HF_LS_IN_130_ECLP_DSSO_01_SCX23_hSAX05_rep2.mzML' + assert results[0].spectra_data_id == 1 # peptide_ref from assert results[0].pep1_id == \ '6_VAEmetETPHLIHKVALDPLTGPMPYQGR_11_MGHAGAIIAGGKGTADEK_11_12_p1' From 2f8e4982c2e72e44e0bb3d9df3bfdb20432c9555 Mon Sep 17 00:00:00 2001 From: colin combe Date: Thu, 25 Jul 2024 08:33:43 +0100 Subject: [PATCH 2/2] assign int id to SpectrumIdentificationProtocol --- 
models/analysiscollectionspectrumidentification.py | 2 +- models/enzyme.py | 2 +- models/searchmodification.py | 2 +- models/spectrumidentificationprotocol.py | 9 +++++++-- parser/MzIdParser.py | 5 ++++- tests/test_MzIdParser_ecoli_dsso.py | 3 ++- 6 files changed, 16 insertions(+), 7 deletions(-) diff --git a/models/analysiscollectionspectrumidentification.py b/models/analysiscollectionspectrumidentification.py index 2e608e1..c311005 100644 --- a/models/analysiscollectionspectrumidentification.py +++ b/models/analysiscollectionspectrumidentification.py @@ -24,6 +24,6 @@ class AnalysisCollectionSpectrumIdentification(Base): __table_args__ = ( ForeignKeyConstraint( ["spectrum_identification_protocol_ref", "upload_id"], - ["spectrumidentificationprotocol.id", "spectrumidentificationprotocol.upload_id"], + ["spectrumidentificationprotocol.sip_ref", "spectrumidentificationprotocol.upload_id"], ), ) diff --git a/models/enzyme.py b/models/enzyme.py index 8debedf..ddefb48 100644 --- a/models/enzyme.py +++ b/models/enzyme.py @@ -19,6 +19,6 @@ class Enzyme(Base): __table_args__ = ( ForeignKeyConstraint( ("protocol_id", "upload_id"), - ("spectrumidentificationprotocol.id", "spectrumidentificationprotocol.upload_id"), + ("spectrumidentificationprotocol.sip_ref", "spectrumidentificationprotocol.upload_id"), ), ) diff --git a/models/searchmodification.py b/models/searchmodification.py index 8f82bea..b6b8cce 100644 --- a/models/searchmodification.py +++ b/models/searchmodification.py @@ -19,6 +19,6 @@ class SearchModification(Base): __table_args__ = ( ForeignKeyConstraint( ("protocol_id", "upload_id"), - ("spectrumidentificationprotocol.id", "spectrumidentificationprotocol.upload_id"), + ("spectrumidentificationprotocol.sip_ref", "spectrumidentificationprotocol.upload_id"), ), ) diff --git a/models/spectrumidentificationprotocol.py b/models/spectrumidentificationprotocol.py index f571879..414380a 100644 --- a/models/spectrumidentificationprotocol.py +++ 
b/models/spectrumidentificationprotocol.py @@ -1,17 +1,22 @@ from sqlalchemy.orm import Mapped, mapped_column -from sqlalchemy import ForeignKey, Text, JSON, Integer, Float +from sqlalchemy import ForeignKey, Text, JSON, Integer, Float, UniqueConstraint from models.base import Base from typing import Optional, Any class SpectrumIdentificationProtocol(Base): __tablename__ = "spectrumidentificationprotocol" - id: Mapped[str] = mapped_column(Text, primary_key=True, nullable=False) + id: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) upload_id: Mapped[str] = mapped_column(Integer, ForeignKey("upload.id"), index=True, primary_key=True, nullable=False) + sip_ref: Mapped[str] = mapped_column(Text, primary_key=False, nullable=False) analysis_software: Mapped[Optional[dict[str, Any]]] = mapped_column(JSON, nullable=True) search_type: Mapped[Optional[dict[str, Any]]] = mapped_column(JSON, nullable=True) additional_search_params: Mapped[Optional[dict[str, Any]]] = mapped_column(JSON, nullable=True) threshold: Mapped[Optional[dict[str, Any]]] = mapped_column(JSON, nullable=True) frag_tol: Mapped[float] = mapped_column(Float, nullable=False) frag_tol_unit: Mapped[str] = mapped_column(Text, nullable=False) + + __table_args__ = ( + UniqueConstraint("sip_ref", "upload_id"), + ) diff --git a/parser/MzIdParser.py b/parser/MzIdParser.py index a47d621..897acad 100644 --- a/parser/MzIdParser.py +++ b/parser/MzIdParser.py @@ -205,6 +205,7 @@ def parse_analysis_protocol_collection(self): sid_protocols = [] search_modifications = [] enzymes = [] + sip_int_id = 0 for sid_protocol_id in self.mzid_reader._offset_index['SpectrumIdentificationProtocol'].keys(): try: sid_protocol = self.mzid_reader.get_by_id(sid_protocol_id, detailed=True) @@ -251,8 +252,9 @@ def parse_analysis_protocol_collection(self): # Threshold threshold = sid_protocol.get('Threshold', {}) data = { - 'id': sid_protocol['id'], + 'id': sip_int_id, 'upload_id': self.writer.upload_id, + 'sip_ref': 
sid_protocol['id'], 'search_type': sid_protocol['SearchType'], 'frag_tol': frag_tol_value, 'frag_tol_unit': frag_tol_unit, @@ -383,6 +385,7 @@ def parse_analysis_protocol_collection(self): }) sid_protocols.append(data) + sip_int_id += 1 self.mzid_reader.reset() self.logger.info('parsing AnalysisProtocolCollection - done. Time: {} sec'.format( diff --git a/tests/test_MzIdParser_ecoli_dsso.py b/tests/test_MzIdParser_ecoli_dsso.py index a5903b0..13487e1 100644 --- a/tests/test_MzIdParser_ecoli_dsso.py +++ b/tests/test_MzIdParser_ecoli_dsso.py @@ -254,7 +254,8 @@ def compare_enzyme(results): def compare_spectrum_identification_protocol(results): assert len(results) == 1 # parsed from in - assert results[0].id == 'SearchProtocol_1_0' # id from + assert results[0].id == 0 + assert results[0].sip_ref == 'SearchProtocol_1_0' # id from assert results[0].frag_tol == 5.0 assert results[0].frag_tol_unit == 'ppm' # cvParams from 'ion series considered in search' (MS:1002473)