Merge pull request #77 from Rappsilber-Laboratory/fk-constraints
assign int IDs to SpectraData and SpectrumIdentificationProtocol
colin-combe authored Jul 25, 2024
2 parents bd9adf1 + 2f8e498 commit b9aada3
Showing 11 changed files with 150 additions and 80 deletions.
3 changes: 2 additions & 1 deletion models/__init__.py
@@ -8,13 +8,14 @@
"base",
"dbsequence",
"enzyme",
"match",
"modifiedpeptide",
"peptideevidence",
"projectdetail",
"projectsubdetail",
"searchmodification",
"spectradata",
"spectrum",
"match",
"spectrumidentificationprotocol",
"upload"
]
15 changes: 11 additions & 4 deletions models/analysiscollectionspectrumidentification.py
@@ -1,6 +1,7 @@
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy.orm import Mapped, mapped_column, relationship
from sqlalchemy import ForeignKey, Text, ForeignKeyConstraint, Integer, Any, JSON
from models.base import Base
from models.match import Match


class AnalysisCollectionSpectrumIdentification(Base):
@@ -14,9 +15,15 @@ class AnalysisCollectionSpectrumIdentification(Base):
spectrum_identification_list_ref: Mapped[str] = mapped_column(Text, primary_key=True, nullable=False)
spectrum_identification_protocol_ref: Mapped[str] = mapped_column(Text, primary_key=False, nullable=False)
spectrum_identification_id: Mapped[str] = mapped_column(Text, primary_key=False, nullable=False)
# # activity date as time?
# activity_date:
# # name ?
# name:
spectra_data_refs: Mapped[dict[str, Any]] = mapped_column(JSON, primary_key=False, nullable=True)
search_database_refs: Mapped[dict[str, Any]] = mapped_column(JSON, primary_key=False, nullable=True)
ForeignKeyConstraint(
["spectrum_identification_protocol_ref", "upload_id"],
["spectrumidentificationprotocol.id", "spectrumidentificationprotocol.upload_id"],
__table_args__ = (
ForeignKeyConstraint(
["spectrum_identification_protocol_ref", "upload_id"],
["spectrumidentificationprotocol.sip_ref", "spectrumidentificationprotocol.upload_id"],
),
)
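Aside on the pattern above (it repeats in every model touched by this commit): a ForeignKeyConstraint created loose in the class body is never attached to the mapped table; it only takes effect once it sits inside __table_args__, which is what this branch fixes. A minimal standalone sketch of the composite-key idiom, using hypothetical Parent/Child names rather than the real models:

from sqlalchemy import ForeignKeyConstraint, Integer
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class Parent(Base):
    __tablename__ = "parent"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    upload_id: Mapped[int] = mapped_column(Integer, primary_key=True)


class Child(Base):
    __tablename__ = "child"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    upload_id: Mapped[int] = mapped_column(Integer, nullable=False)
    parent_id: Mapped[int] = mapped_column(Integer, nullable=False)

    # Both columns must match an (id, upload_id) pair in parent.
    __table_args__ = (
        ForeignKeyConstraint(
            ["parent_id", "upload_id"],
            ["parent.id", "parent.upload_id"],
        ),
    )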
8 changes: 5 additions & 3 deletions models/enzyme.py
@@ -16,7 +16,9 @@ class Enzyme(Base):
semi_specific: Mapped[bool] = mapped_column(BOOLEAN, nullable=True)
site_regexp: Mapped[str] = mapped_column(Text, nullable=True)
accession: Mapped[str] = mapped_column(Text, nullable=True)
ForeignKeyConstraint(
("protocol_id", "upload_id"),
("spectrumidentificationprotocol.id", "spectrumidentificationprotocol.upload_id"),
__table_args__ = (
ForeignKeyConstraint(
("protocol_id", "upload_id"),
("spectrumidentificationprotocol.sip_ref", "spectrumidentificationprotocol.upload_id"),
),
)
35 changes: 19 additions & 16 deletions models/match.py
@@ -1,4 +1,4 @@
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy.orm import Mapped, mapped_column, relationship
from sqlalchemy import ForeignKey, Text, FLOAT, JSON, BOOLEAN, Integer, ForeignKeyConstraint, CHAR
from models.base import Base
from typing import Optional, Any
@@ -10,7 +10,7 @@ class Match(Base):
upload_id: Mapped[int] = mapped_column(Integer, ForeignKey("upload.id"), index=True, primary_key=True,
nullable=False)
spectrum_id: Mapped[str] = mapped_column(Text, nullable=True)
spectra_data_ref: Mapped[str] = mapped_column(Text, nullable=True)
spectra_data_id: Mapped[int] = mapped_column(Integer, nullable=True) # nullable for csv data
multiple_spectra_identification_id: Mapped[str] = mapped_column(Integer, nullable=True)
multiple_spectra_identification_pc: Mapped[str] = mapped_column(CHAR, nullable=True)
pep1_id: Mapped[str] = mapped_column(Text, nullable=False)
@@ -22,22 +22,25 @@ class Match(Base):
exp_mz: Mapped[float] = mapped_column(FLOAT, nullable=True)
calc_mz: Mapped[float] = mapped_column(FLOAT, nullable=True)
sil_id: Mapped[str] = mapped_column(Text, nullable=True) # null if from csv file
ForeignKeyConstraint(
["sil_id", "upload_id"],
["analysiscollectionspectrumidentification.spectrum_identification_list_ref",
"analysiscollectionspectrumidentification.upload_id"],
),
__table_args__ = (
ForeignKeyConstraint(
["sil_id", "upload_id"],
["analysiscollectionspectrumidentification.spectrum_identification_list_ref",
"analysiscollectionspectrumidentification.upload_id"],
),
ForeignKeyConstraint(
["pep1_id", "upload_id"],
["modifiedpeptide.id", "modifiedpeptide.upload_id"],
),
ForeignKeyConstraint(
["pep2_id", "upload_id"],
["modifiedpeptide.id", "modifiedpeptide.upload_id"],
)
)

# Can't use this ForeignKeyConstraint, because we want to allow people to upload data
# without spectra
# ForeignKeyConstraint(
# ["spectrum_id", "spectra_data_ref", "upload_id"],
# ["Spectrum.id", "Spectrum.spectra_data_ref", "Spectrum.upload_id"],
# ),
ForeignKeyConstraint(
["pep1_id", "upload_id"],
["modifiedpeptide.id", "modifiedpeptide.upload_id"],
),
ForeignKeyConstraint(
["pep2_id", "upload_id"],
["modifiedpeptide.id", "modifiedpeptide.upload_id"],
)
# ),
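Illustrative only (not part of the commit): with the composite constraints above, joining Match to its peptides has to match on upload_id as well as the peptide id. A rough query sketch, assuming the peptide model class is named ModifiedPeptide:

from sqlalchemy import and_, select

from models.match import Match
from models.modifiedpeptide import ModifiedPeptide  # assumed class name

# Pair each match with its first peptide within one upload.
stmt = (
    select(Match, ModifiedPeptide)
    .join(
        ModifiedPeptide,
        and_(
            Match.pep1_id == ModifiedPeptide.id,
            Match.upload_id == ModifiedPeptide.upload_id,
        ),
    )
    .where(Match.upload_id == 1)  # placeholder upload id
)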
16 changes: 9 additions & 7 deletions models/peptideevidence.py
@@ -11,11 +11,13 @@ class PeptideEvidence(Base):
dbsequence_ref: Mapped[str] = mapped_column(Text, primary_key=True, nullable=False)
pep_start: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False)
is_decoy: Mapped[bool] = mapped_column(BOOLEAN, nullable=True)
ForeignKeyConstraint(
("dbsequence_ref", "upload_id"),
("dbsequence.id", "dbsequence.upload_id"),
)
ForeignKeyConstraint(
("peptide_ref", "upload_id"),
("modifiedpeptide.id", "modifiedpeptide.upload_id"),
__table_args__ = (
ForeignKeyConstraint(
("dbsequence_ref", "upload_id"),
("dbsequence.id", "dbsequence.upload_id"),
),
ForeignKeyConstraint(
("peptide_ref", "upload_id"),
("modifiedpeptide.id", "modifiedpeptide.upload_id"),
)
)
8 changes: 5 additions & 3 deletions models/searchmodification.py
@@ -16,7 +16,9 @@ class SearchModification(Base):
fixed_mod: Mapped[bool] = mapped_column(BOOLEAN, nullable=False)
accession: Mapped[str] = mapped_column(Text, nullable=True)
crosslinker_id: Mapped[str] = mapped_column(Text, nullable=True)
ForeignKeyConstraint(
("protocol_id", "upload_id"),
("spectrumidentificationprotocol.id", "spectrumidentificationprotocol.upload_id"),
__table_args__ = (
ForeignKeyConstraint(
("protocol_id", "upload_id"),
("spectrumidentificationprotocol.sip_ref", "spectrumidentificationprotocol.upload_id"),
),
)
20 changes: 20 additions & 0 deletions models/spectradata.py
@@ -0,0 +1,20 @@
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy import ForeignKey, Text, Integer, UniqueConstraint
from models.base import Base


class SpectraData(Base):
__tablename__ = "spectradata"
id: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False)
upload_id: Mapped[int] = mapped_column(Integer, ForeignKey("upload.id"), primary_key=True,
index=True, nullable=False)
spectra_data_ref: Mapped[str] = mapped_column(Text, nullable=False)
location: Mapped[str] = mapped_column(Text, nullable=False)
name: Mapped[str] = mapped_column(Text, nullable=True)
external_format_documentation: Mapped[str] = mapped_column(Text, nullable=True)
file_format: Mapped[str] = mapped_column(Text, nullable=False)
spectrum_id_format: Mapped[str] = mapped_column(Text, nullable=False)

__table_args__ = (
UniqueConstraint("spectra_data_ref", "upload_id"),
)
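The integer id plus the UniqueConstraint on (spectra_data_ref, upload_id) keeps the original mzIdentML reference recoverable per upload while other tables carry the compact surrogate key. A hypothetical helper (not in the repo) sketching that lookup:

from typing import Optional

from sqlalchemy import select
from sqlalchemy.orm import Session

from models.spectradata import SpectraData


def spectra_data_id_for_ref(session: Session, ref: str, upload_id: int) -> Optional[int]:
    # Map an mzIdentML spectraData ref back to the new integer surrogate id.
    return session.scalar(
        select(SpectraData.id).where(
            SpectraData.spectra_data_ref == ref,
            SpectraData.upload_id == upload_id,
        )
    )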
9 changes: 7 additions & 2 deletions models/spectrumidentificationprotocol.py
@@ -1,17 +1,22 @@
from sqlalchemy.orm import Mapped, mapped_column
from sqlalchemy import ForeignKey, Text, JSON, Integer, Float
from sqlalchemy import ForeignKey, Text, JSON, Integer, Float, UniqueConstraint
from models.base import Base
from typing import Optional, Any


class SpectrumIdentificationProtocol(Base):
__tablename__ = "spectrumidentificationprotocol"
id: Mapped[str] = mapped_column(Text, primary_key=True, nullable=False)
id: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False)
upload_id: Mapped[str] = mapped_column(Integer, ForeignKey("upload.id"), index=True,
primary_key=True, nullable=False)
sip_ref: Mapped[str] = mapped_column(Text, primary_key=False, nullable=False)
analysis_software: Mapped[Optional[dict[str, Any]]] = mapped_column(JSON, nullable=True)
search_type: Mapped[Optional[dict[str, Any]]] = mapped_column(JSON, nullable=True)
additional_search_params: Mapped[Optional[dict[str, Any]]] = mapped_column(JSON, nullable=True)
threshold: Mapped[Optional[dict[str, Any]]] = mapped_column(JSON, nullable=True)
frag_tol: Mapped[float] = mapped_column(Float, nullable=False)
frag_tol_unit: Mapped[str] = mapped_column(Text, nullable=False)

__table_args__ = (
UniqueConstraint("sip_ref", "upload_id"),
)
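The parser (see MzIdParser.py below) fills these rows by counting protocols per upload: the integer id becomes the key, while sip_ref keeps the original mzIdentML id that enzyme.py and searchmodification.py now reference. Roughly, as a standalone sketch with illustrative names:

def assign_protocol_ids(mzid_protocol_refs, upload_id):
    """Illustrative: map mzIdentML protocol ids to sequential integer ids."""
    sip_id_lookup = {}
    protocol_rows = []
    for sip_int_id, sip_ref in enumerate(mzid_protocol_refs):
        protocol_rows.append({
            "id": sip_int_id,      # new integer surrogate key
            "upload_id": upload_id,
            "sip_ref": sip_ref,    # original mzIdentML id, kept as the FK target
        })
        sip_id_lookup[sip_ref] = sip_int_id
    return protocol_rows, sip_id_lookup

For example, assign_protocol_ids(["SIP_1", "SIP_2"], upload_id=1) would yield ids 0 and 1 and a lookup from each ref to its id.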
101 changes: 63 additions & 38 deletions parser/MzIdParser.py
@@ -43,6 +43,7 @@ def __init__(self, mzid_path, temp_dir, peak_list_dir, writer, logger):
self.mzid_path = mzid_path

self.peak_list_readers = {} # peak list readers indexed by spectraData_ref
self.spectra_data_id_lookup = {} # spectra_data_ref to spectra_data_id lookup
self.temp_dir = temp_dir
if not self.temp_dir.endswith('/'):
self.temp_dir += '/'
@@ -81,8 +82,7 @@ def parse(self):
start_time = time()
self.upload_info() # overridden (empty function) in xiSPEC subclass
self.parse_analysis_protocol_collection()
if self.peak_list_dir:
self.init_peak_list_readers()
self.parse_spectradata_and_init_peak_list_readers()
self.parse_analysis_collection()
self.parse_db_sequences() # overridden (empty function) in xiSPEC subclass
self.parse_peptides()
@@ -117,7 +117,7 @@ def check_spectra_data_validity(sp_datum):
if 'location' not in sp_datum or sp_datum['location'] is None:
raise MzIdParseException('SpectraData is missing location')

def init_peak_list_readers(self):
def parse_spectradata_and_init_peak_list_readers(self):
"""
Sets self.peak_list_readers by looping through SpectraData elements
@@ -126,53 +126,76 @@ def init_peak_list_readers(self):
value: associated peak_list_reader
"""
peak_list_readers = {}
spectra_data = []
spectra_data_id_lookup = {}
sd_int_id = 0
for spectra_data_id in self.mzid_reader._offset_index["SpectraData"].keys():
sp_datum = self.mzid_reader.get_by_id(spectra_data_id, tag_id='SpectraData')

self.check_spectra_data_validity(sp_datum)

sd_id = sp_datum['id']
peak_list_file_name = ntpath.basename(sp_datum['location'])
peak_list_file_path = self.peak_list_dir + peak_list_file_name
file_format = sp_datum['FileFormat'].accession
spectrum_id_format = sp_datum['SpectrumIDFormat'].accession

# noinspection PyBroadException
try:
peak_list_reader = PeakListWrapper(
peak_list_file_path,
sp_datum['FileFormat'].accession,
sp_datum['SpectrumIDFormat'].accession
)
# ToDo: gz/zip code parts could do with refactoring
except Exception:
# try gz version
if self.peak_list_dir:
peak_list_file_path = self.peak_list_dir + peak_list_file_name
# noinspection PyBroadException
try:
peak_list_reader = PeakListWrapper(
PeakListWrapper.extract_gz(peak_list_file_path + '.gz'),
sp_datum['FileFormat'].accession,
sp_datum['SpectrumIDFormat'].accession
peak_list_file_path,
file_format,
spectrum_id_format
)
except IOError:
# look for missing peak lists in zip files
for file in os.listdir(self.peak_list_dir):
if file.endswith(".zip"):
zip_file = os.path.join(self.peak_list_dir, file)
try:
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
zip_ref.extractall(self.peak_list_dir)
except IOError:
raise IOError()
# ToDo: gz/zip code parts could do with refactoring
except Exception:
# try gz version
try:
peak_list_reader = PeakListWrapper(
peak_list_file_path,
sp_datum['FileFormat'].accession,
sp_datum['SpectrumIDFormat'].accession
PeakListWrapper.extract_gz(peak_list_file_path + '.gz'),
file_format,
spectrum_id_format
)
except Exception:
raise MzIdParseException('Missing peak list file: %s' % peak_list_file_path)
except IOError:
# look for missing peak lists in zip files
for file in os.listdir(self.peak_list_dir):
if file.endswith(".zip"):
zip_file = os.path.join(self.peak_list_dir, file)
try:
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
zip_ref.extractall(self.peak_list_dir)
except IOError:
raise IOError()
try:
peak_list_reader = PeakListWrapper(
peak_list_file_path,
file_format,
spectrum_id_format
)
except Exception:
raise MzIdParseException('Missing peak list file: %s' % peak_list_file_path)

peak_list_readers[spectra_data_id] = peak_list_reader

spectra_datum = {
'id': sd_int_id,
'upload_id': self.writer.upload_id,
'spectra_data_ref': spectra_data_id,
'location': sp_datum['location'],
'name': sp_datum.get('name', None),
'external_format_documentation': sp_datum.get('externalFormatDocumentation', None),
'file_format': file_format,
'spectrum_id_format': spectrum_id_format
}

spectra_data.append(spectra_datum)
spectra_data_id_lookup[spectra_data_id] = sd_int_id
sd_int_id += 1

peak_list_readers[sd_id] = peak_list_reader
self.writer.write_data('spectradata', spectra_data)

self.peak_list_readers = peak_list_readers
self.spectra_data_id_lookup = spectra_data_id_lookup

def parse_analysis_protocol_collection(self):
"""Parse the AnalysisProtocolCollection and write SpectrumIdentificationProtocols."""
@@ -182,6 +205,7 @@ def parse_analysis_protocol_collection(self):
sid_protocols = []
search_modifications = []
enzymes = []
sip_int_id = 0
for sid_protocol_id in self.mzid_reader._offset_index['SpectrumIdentificationProtocol'].keys():
try:
sid_protocol = self.mzid_reader.get_by_id(sid_protocol_id, detailed=True)
@@ -228,8 +252,9 @@ def parse_analysis_protocol_collection(self):
# Threshold
threshold = sid_protocol.get('Threshold', {})
data = {
'id': sid_protocol['id'],
'id': sip_int_id,
'upload_id': self.writer.upload_id,
'sip_ref': sid_protocol['id'],
'search_type': sid_protocol['SearchType'],
'frag_tol': frag_tol_value,
'frag_tol_unit': frag_tol_unit,
@@ -360,6 +385,7 @@ def parse_analysis_protocol_collection(self):
})

sid_protocols.append(data)
sip_int_id += 1

self.mzid_reader.reset()
self.logger.info('parsing AnalysisProtocolCollection - done. Time: {} sec'.format(
@@ -453,8 +479,6 @@ def parse_peptides(self):
peptide_index = 0
peptides = []
for pep_id in self.mzid_reader._offset_index["Peptide"].keys():
if pep_id == 'peptide_1497_2_p1':
pass
peptide = self.mzid_reader.get_by_id(pep_id, tag_id='Peptide')
donor_link_site = None
acc_link_site = None
@@ -705,11 +729,12 @@ def main_loop(self):
msi_id = m[1]
msi_pc = m[2]

sd_int_id = self.spectra_data_id_lookup[sid_result['spectraData_ref']]
ident_data = {
'id': spec_id_item['id'],
'upload_id': self.writer.upload_id,
'spectrum_id': sid_result['spectrumID'],
'spectra_data_ref': sid_result['spectraData_ref'],
'spectra_data_id': sd_int_id,
'pep1_id': spec_id_item['peptide_ref'],
'pep2_id': None,
'charge_state': int(spec_id_item['chargeState']),
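For completeness (a sketch, not from the diff): main_loop consults the lookup built during SpectraData parsing, and csv-sourced matches simply have no spectraData_ref, which is why match.spectra_data_id is nullable. Something like:

def resolve_spectra_data_id(spectra_data_id_lookup, spectra_data_ref):
    """Illustrative: mzid results carry a spectraData_ref that maps to the new
    integer id; csv uploads have none, so the column is left NULL."""
    if spectra_data_ref is None:
        return None
    return spectra_data_id_lookup[spectra_data_ref]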
2 changes: 1 addition & 1 deletion parser/database/create_db_schema.py
@@ -20,7 +20,7 @@ def drop_db(connection_str):


def create_schema(connection_str):
engine = create_engine(connection_str)
engine = create_engine(connection_str) # , echo=True)
Base.metadata.create_all(engine)
logging.info(Base.metadata.tables)
engine.dispose()
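Usage sketch for the schema helper (the import path and connection string below are placeholders, not from the repo); passing echo=True to create_engine, as hinted by the comment kept in the diff, makes SQLAlchemy log the emitted DDL:

# Hypothetical usage; adjust the import path and connection string to your setup.
from parser.database.create_db_schema import create_schema

create_schema("postgresql://user:password@localhost:5432/xiview")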