Skip to content

Commit

Permalink
merge data workflow to main (#48)
Browse files Browse the repository at this point in the history
Added data workflow for fastfold
  • Loading branch information
Gy-Lu authored Aug 23, 2022
1 parent a37c8b4 commit b254d46
Show file tree
Hide file tree
Showing 14 changed files with 432 additions and 17 deletions.
30 changes: 29 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,16 @@ FastFold provides a **high-performance implementation of Evoformer** with the fo
3. Ease of use
* Huge performance gains with a few lines changes
* You don't need to care about how the parallel part is implemented
4. Faster data processing, about 3x faster than the original way

## Installation

To install and use FastFold, you will need:
+ Python 3.8 or later
+ Python 3.8 or 3.9.
+ [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 11.1 or above
+ PyTorch 1.10 or above


For now, you can install FastFold:
### Using Conda (Recommended)

Expand Down Expand Up @@ -116,6 +118,32 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
--hhsearch_binary_path `which hhsearch` \
--kalign_binary_path `which kalign`
```
Alternatively, you can run the script `./inference.sh` and change the parameters in it, especially the data paths.
```shell
./inference.sh
```

#### Inference with data workflow
AlphaFold's data pre-processing takes a lot of time, so we speed it up with a [Ray](https://docs.ray.io/en/latest/workflows/concepts.html) workflow, which achieves about a 3x speedup. To run inference with the Ray workflow, install the packages below and add the parameter `--enable_workflow` to the command line or to the shell script `./inference.sh`.
```shell
pip install ray==1.13.0 pyarrow
```
```shell
python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
--output_dir ./ \
--gpus 2 \
--uniref90_database_path data/uniref90/uniref90.fasta \
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \
--pdb70_database_path data/pdb70/pdb70 \
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
--jackhmmer_binary_path `which jackhmmer` \
--hhblits_binary_path `which hhblits` \
--hhsearch_binary_path `which hhsearch` \
--kalign_binary_path `which kalign` \
--enable_workflow
```


## Performance Benchmark

Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ RUN conda install pytorch==1.10.0 torchvision torchaudio cudatoolkit=11.3 -c pyt
&& conda install hmmer==3.3.2 hhsuite=3.3.0 kalign2=2.04 -c bioconda

RUN pip install biopython==1.79 dm-tree==0.1.6 ml-collections==0.1.0 numpy==1.21.2 \
PyYAML==5.4.1 requests==2.26.0 scipy==1.7.1 tqdm==4.62.2 typing-extensions==3.10.0.2 einops
PyYAML==5.4.1 requests==2.26.0 scipy==1.7.1 tqdm==4.62.2 typing-extensions==3.10.0.2 einops ray==1.13.0 pyarrow

RUN pip install colossalai==0.1.8+torch1.10cu11.3 -f https://release.colossalai.org

Expand Down
1 change: 1 addition & 0 deletions fastfold/workflow/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .workflow_run import batch_run
5 changes: 5 additions & 0 deletions fastfold/workflow/factory/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .task_factory import TaskFactory
from .hhblits import HHBlitsFactory
from .hhsearch import HHSearchFactory
from .jackhmmer import JackHmmerFactory
from .hhfilter import HHfilterFactory
29 changes: 29 additions & 0 deletions fastfold/workflow/factory/hhblits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from ray import workflow
from typing import List
from fastfold.workflow.factory import TaskFactory
from ray.workflow.common import Workflow
import fastfold.data.tools.hhblits as ffHHBlits

class HHBlitsFactory(TaskFactory):
    """Factory that produces Ray workflow steps running an HHBlits search."""

    keywords = ['binary_path', 'databases', 'n_cpu']

    def gen_task(self, fasta_path: str, output_path: str, after: List[Workflow]=None) -> Workflow:
        """Build a workflow step that queries HHBlits with *fasta_path* and
        writes the resulting a3m alignment to *output_path*.

        ``after`` lists upstream workflow steps this step must wait for.
        """
        self.isReady()

        # Construct the runner up front; the step closure below captures it.
        hhblits_runner = ffHHBlits.HHBlits(
            binary_path=self.config['binary_path'],
            databases=self.config['databases'],
            n_cpu=self.config['n_cpu'],
        )

        @workflow.step
        def hhblits_step(seq_path: str, out_path: str, deps: List[Workflow]) -> None:
            # `deps` is unused in the body; it only encodes step ordering for Ray.
            query_result = hhblits_runner.query(seq_path)
            with open(out_path, "w") as a3m_file:
                a3m_file.write(query_result["a3m"])

        return hhblits_step.step(fasta_path, output_path, after)
33 changes: 33 additions & 0 deletions fastfold/workflow/factory/hhfilter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import subprocess
import logging
from ray import workflow
from typing import List
from fastfold.workflow.factory import TaskFactory
from ray.workflow.common import Workflow

class HHfilterFactory(TaskFactory):
    """Factory that produces Ray workflow steps running the hhfilter binary."""

    keywords = ['binary_path']

    def gen_task(self, fasta_path: str, output_path: str, after: List[Workflow]=None) -> Workflow:
        """Build a workflow step that filters the alignment in *fasta_path*
        with hhfilter and writes the result to *output_path*.

        Optional config keys ``id`` and ``cov`` are forwarded as the
        corresponding hhfilter command-line flags. ``after`` lists upstream
        workflow steps this step must wait for.

        The step raises ``subprocess.CalledProcessError`` when hhfilter exits
        with a non-zero status.
        """
        self.isReady()

        # generate step function
        @workflow.step
        def hhfilter_step(fasta_path: str, output_path: str, after: List[Workflow]) -> None:

            cmd = [
                self.config.get('binary_path'),
            ]
            if 'id' in self.config:
                cmd += ['-id', str(self.config.get('id'))]
            if 'cov' in self.config:
                cmd += ['-cov', str(self.config.get('cov'))]
            cmd += ['-i', fasta_path, '-o', output_path]

            logging.info(f"HHfilter start: {' '.join(cmd)}")

            # check=True: fail the workflow step loudly instead of silently
            # continuing with a missing or partial output file.
            subprocess.run(cmd, check=True)

        return hhfilter_step.step(fasta_path, output_path, after)
38 changes: 38 additions & 0 deletions fastfold/workflow/factory/hhsearch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from fastfold.workflow.factory import TaskFactory
from ray import workflow
from ray.workflow.common import Workflow
import fastfold.data.tools.hhsearch as ffHHSearch
from typing import List

class HHSearchFactory(TaskFactory):
    """Factory that produces Ray workflow steps running an HHSearch query."""

    keywords = ['binary_path', 'databases', 'n_cpu']

    def gen_task(self, a3m_path: str, output_path: str, after: List[Workflow]=None, atab_path: str=None) -> Workflow:
        """Build a workflow step that runs HHSearch on the alignment in
        *a3m_path* and writes the hit file to *output_path*.

        Args:
            a3m_path: path of the input a3m alignment.
            output_path: destination for the HHSearch result.
            after: upstream workflow steps this step must wait for.
            atab_path: optional destination for the atab output; when given,
                the runner is queried with ``gen_atab=True``.

        Note: the inner step already accepted ``atab_path`` but the factory
        never forwarded it, so the atab output could never be produced; the
        new keyword-with-default parameter is backward-compatible.
        """
        self.isReady()

        # setup runner
        runner = ffHHSearch.HHSearch(
            binary_path=self.config['binary_path'],
            databases=self.config['databases'],
            n_cpu=self.config['n_cpu']
        )

        # generate step function
        @workflow.step
        def hhsearch_step(a3m_path: str, output_path: str, after: List[Workflow], atab_path: str = None) -> None:

            with open(a3m_path, "r") as f:
                a3m = f.read()
            if atab_path:
                hhsearch_result, atab = runner.query(a3m, gen_atab=True)
            else:
                hhsearch_result = runner.query(a3m)
            with open(output_path, "w") as f:
                f.write(hhsearch_result)
            if atab_path:
                with open(atab_path, "w") as f:
                    f.write(atab)

        return hhsearch_step.step(a3m_path, output_path, after, atab_path)
34 changes: 34 additions & 0 deletions fastfold/workflow/factory/jackhmmer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from fastfold.workflow.factory import TaskFactory
from ray import workflow
from ray.workflow.common import Workflow
import fastfold.data.tools.jackhmmer as ffJackHmmer
from fastfold.data import parsers
from typing import List

class JackHmmerFactory(TaskFactory):
    """Factory that produces Ray workflow steps running a JackHmmer search."""

    keywords = ['binary_path', 'database_path', 'n_cpu', 'uniref_max_hits']

    def gen_task(self, fasta_path: str, output_path: str, after: List[Workflow]=None) -> Workflow:
        """Build a workflow step that queries JackHmmer with *fasta_path*,
        converts the Stockholm result to a3m, and writes it to *output_path*.

        ``after`` lists upstream workflow steps this step must wait for.
        """
        self.isReady()

        # Construct the runner up front; the step closure below captures it.
        jackhmmer_runner = ffJackHmmer.Jackhmmer(
            binary_path=self.config['binary_path'],
            database_path=self.config['database_path'],
            n_cpu=self.config['n_cpu'],
        )

        @workflow.step
        def jackhmmer_step(seq_path: str, out_path: str, deps: List[Workflow]) -> None:
            # `deps` is unused in the body; it only encodes step ordering for Ray.
            first_hit = jackhmmer_runner.query(seq_path)[0]
            a3m_text = parsers.convert_stockholm_to_a3m(
                first_hit['sto'],
                max_sequences=self.config['uniref_max_hits'],
            )
            with open(out_path, "w") as a3m_file:
                a3m_file.write(a3m_text)

        return jackhmmer_step.step(fasta_path, output_path, after)
50 changes: 50 additions & 0 deletions fastfold/workflow/factory/task_factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from ast import keyword
import json
from ray.workflow.common import Workflow
from os import path
from typing import List

class TaskFactory:
    """Base class for workflow task factories.

    Subclasses declare the config keys they require in ``keywords`` and
    implement ``gen_task`` to emit a Ray workflow step. Configuration comes
    from an explicit dict, an explicit JSON config path, or the default
    ``./config.json``.
    """

    keywords = []

    # Sentinel distinguishing "second argument not given" from a real value
    # (including None) in configure().
    _UNSET = object()

    def __init__(self, config: dict = None, config_path: str = None) -> None:
        """Initialize the factory's config.

        Args:
            config: explicit configuration dict (takes precedence).
            config_path: path to a JSON config file; the default
                ``./config.json`` is used when neither argument is given.
        """
        # Always give the instance a config dict so configure()/isReady()
        # never hit an AttributeError (the original returned early for
        # keyword-less subclasses without ever setting self.config).
        self.config = {}

        # Skip loading if no keyword is required from a config file.
        if not self.__class__.keywords:
            return

        # setting config for factory
        if config is not None:
            self.config = config
        elif config_path is not None:
            self.loadConfig(config_path)
        else:
            self.loadConfig()

    def configure(self, config, value=_UNSET, purge=False) -> None:
        """Update the factory configuration.

        Two calling forms are supported (the original class defined two
        ``configure`` methods and the second silently shadowed the first;
        they are merged here so both forms work):

        * ``configure(mapping, purge=False)`` — merge *mapping* into the
          config, or replace the config entirely when ``purge`` is true.
        * ``configure(keyword, value)`` — set a single config entry.
        """
        if value is not self._UNSET:
            self.config[config] = value
        elif purge:
            self.config = config
        else:
            self.config.update(config)

    def gen_task(self, after: "List[Workflow]" = None, *args, **kwargs) -> "Workflow":
        """Produce a workflow step; must be implemented by subclasses."""
        raise NotImplementedError

    def isReady(self):
        """Raise KeyError if any required keyword is missing from the config."""
        for key in self.__class__.keywords:
            if key not in self.config:
                raise KeyError(f"{self.__class__.__name__} not ready: \"{key}\" not specified")

    def loadConfig(self, config_path='./config.json'):
        """Load this factory's section from the global JSON config file.

        The section name is the class name minus its ``Factory`` suffix,
        looked up under the top-level ``tools`` key.

        Raises:
            KeyError: when the file has no ``tools`` section or no entry
                for this factory.
        """
        with open(config_path) as configFile:
            globalConfig = json.load(configFile)
            if 'tools' not in globalConfig:
                raise KeyError("\"tools\" not found in global config file")
            # Strip the trailing "Factory" (7 chars) to get the tool name.
            factoryName = self.__class__.__name__[:-7]
            if factoryName not in globalConfig['tools']:
                raise KeyError(f"\"{factoryName}\" not found in the \"tools\" section in config")
            self.config = globalConfig['tools'][factoryName]
1 change: 1 addition & 0 deletions fastfold/workflow/template/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .fastfold_data_workflow import FastFoldDataWorkFlow
Loading

0 comments on commit b254d46

Please sign in to comment.