* @article{armah2024protein,
  author    = {Armah-Sekum, Robert Ebo and Szedmak, Sandor and Rousu, Juho},
  title     = {Protein function prediction through multi-view multi-label latent tensor reconstruction},
  journal   = {BMC Bioinformatics},
  volume    = {25},
  number    = {1},
  pages     = {174},
  year      = {2024},
  publisher = {Springer}
}
In this project, we utilized the latent tensor reconstruction (LTR) approach to model the joint interactions between different protein features to predict protein functional terms (i.e., Gene Ontology terms).
The code was developed using Python >= 3.8.
The main algorithm, ./scripts/go_ltr_main.py, is based on the LTR software, which is available at GO-LTR.
The following packages, which can be downloaded free of charge from PyPI, are required to run the file:
- numpy
- scipy
- itertools (part of the Python standard library; no separate installation is needed)
- ./scripts/ltr_solver_multiview_0164.py - base LTR solver on which the go_ltr_main.py algorithm runs
- ./scripts/go_ltr_main.py - main file for running GO-LTR and generating predictions
The UniProtKB IDs of the manually reviewed Swiss-Prot protein sequences used for the study are in the ./dataset directory.
Using the IDs, one can find the full specification of each protein in the UniProtKB database.
The accession numbers obtained from the UniProtKB search can then be used to query other databases, such as AlphaFoldDB, Rhea-DB, etc., for specific protein feature information.
The full manually reviewed Swiss-prot sequences can be downloaded at https://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/complete/
Clustering of sequences was done with MMseqs2.
- dataset: Contains the UniProtKB IDs of all sequences used in our experiments. There are .txt files for each ontology branch: Molecular Function Ontology (MFO), Cellular Component Ontology (CCO) and Biological Process Ontology (BPO).
- images: Contains the image files for the workflow of the GO-LTR model.
- scripts: Contains the main script for training the GO-LTR model and generating predictions.
We leveraged three different protein features: sequence embeddings generated from the ProtT5 protein language model, InterPro fingerprints, and protein-protein interaction (PPI) data from StringDB.
As shown above, the functions associated with a particular protein form a consistent subgraph of the Gene Ontology (GO) graph. The functional terms also follow the true-path annotation rule, where a protein annotated to a deep-level term in the ontology is automatically annotated to all the parent (ancestor) terms of that term.
Given a multi-view (multimodal) data sample
Given: a sample $$ \mathcal{S} = \left\lbrace \left( (\mathbf{x}_i^{(1)},\dots, \mathbf{x}_i^{(n_d)}), \mathbf{y}_i \right) \mid i\in [m] \right\rbrace, \qquad \mathbf{x}_i^{(d)} \in \mathbb{R}^{n_{x_d}},\ d\in [n_d] $$
We used the CAFA-evaluator script for performance evaluation of the models considered in the study.
* @misc{szedmak2020solution,
  author        = {Szedmak, Sandor and Cichonska, Anna and Julkunen, Heli and Pahikkala, Tapio and Rousu, Juho},
  title         = {A solution for large scale nonlinear regression with high rank and degree at constant memory complexity via latent tensor reconstruction},
  year          = {2020},
  eprint        = {2005.01538},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2005.01538}
}
* @article{wang2021modeling,
  author    = {Wang, Tianduanyi and Szedmak, Sandor and Wang, Haishan and Aittokallio, Tero and Pahikkala, Tapio and Cichonska, Anna and Rousu, Juho},
  title     = {Modeling drug combination effects via latent tensor reconstruction},
  journal   = {Bioinformatics},
  volume    = {37},
  number    = {Supplement\_1},
  pages     = {i93--i101},
  year      = {2021},
  publisher = {Oxford University Press}
}
* @article{armah2024protein,
  author    = {Armah-Sekum, Robert Ebo and Szedmak, Sandor and Rousu, Juho},
  title     = {Protein function prediction through multi-view multi-label latent tensor reconstruction},
  journal   = {BMC Bioinformatics},
  volume    = {25},
  number    = {1},
  pages     = {174},
  year      = {2024},
  publisher = {Springer}
}