Skip to content

Commit

Permalink
Merge pull request #33 from galaxy-genome-annotation/fix_bulk_gff3_fe…
Browse files Browse the repository at this point in the history
…ature_import

begin to refactor gff3
  • Loading branch information
abretaud authored Jun 26, 2020
2 parents 7128414 + 193bd46 commit d559ca0
Show file tree
Hide file tree
Showing 23 changed files with 3,859 additions and 128 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ cover
docs/_build

# Python virtualenv
.venv
.venv*

# test harness
test_harness.py
Expand Down
2 changes: 2 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ Or with the Arrow client:
History
-------

- 4.2.3
- Fixed `load_gff3` to load transcripts (including their CDS) and non-coding feature types more accurately.
- 4.2.2
- Drastically speed up load_gff3
- `load_gff3` now uses the Apollo `add_transcript` method when the feature being loaded is a gene or mRNA
Expand Down
258 changes: 147 additions & 111 deletions apollo/annotations/__init__.py

Large diffs are not rendered by default.

150 changes: 147 additions & 3 deletions apollo/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,139 @@ def AssertAdmin(user):


def _tnType(feature):
    """Return the type name to use for ``feature``.

    Types in the recognised list are passed through unchanged; every other
    type is reported as ``'exon'``.

    :param feature: object exposing a ``type`` attribute.
    :return: ``feature.type`` when recognised, otherwise ``'exon'``.
    """
    recognised = (
        'gene', 'mRNA', 'exon', 'CDS', 'terminator', 'tRNA',
        'snRNA', 'snoRNA', 'ncRNA', 'rRNA', 'miRNA',
        'repeat_region', 'transposable_element', 'pseudogene', 'transcript',
    )
    return feature.type if feature.type in recognised else 'exon'


def _yieldGeneData(gene, disable_cds_recalculation=False, use_name=False):
    """Convert a gene-level feature (and its transcripts) for Apollo.

    :param gene: feature exposing ``type`` and ``sub_features``.
    :param disable_cds_recalculation: forwarded to every builder called here.
    :param use_name: forwarded to every builder called here.
    :return: for a feature of type ``'gene'`` that has sub-features, the list
        of converted transcripts; otherwise the dict describing the feature
        itself (carrying a ``'children'`` list when it has sub-features).
    """
    options = {
        'disable_cds_recalculation': disable_cds_recalculation,
        'use_name': use_name,
    }
    record = _yieldSubFeatureData(gene, **options)

    # No sub-features: return a generic feature without a 'children' key.
    if not gene.sub_features:
        return record

    transcripts = []
    record['children'] = transcripts
    for child in gene.sub_features:
        child_type = _tnType(child)
        if child_type in coding_transcript_types:
            transcripts.append(_yieldCodingTranscriptData(child, **options))
        elif child_type in noncoding_transcript_types:
            transcripts.append(_yieldNonCodingTranscriptData(child, **options))
        # Sub-features of any other type are not included.

    # TODO: handle comments, dbxrefs, attributes, aliases, description,
    # GO, Gene Product, Provenance

    if gene.type == 'gene':
        # Only sending mRNA level as apollo is more comfortable with orphan mRNAs
        return transcripts
    return record


def _yieldSubFeatureData(f, disable_cds_recalculation=False, use_name=False):
    """Build the dict describing a single feature, without its sub-features.

    :param f: feature exposing ``strand``, ``location.start``,
        ``location.end``, ``type``, ``id`` and ``qualifiers``.
    :param disable_cds_recalculation: when truthy, adds ``'use_cds': 'true'``.
    :param use_name: when truthy, adds ``'use_name': True``.
    :return: dict with ``'location'`` and ``'type'`` entries, plus the
        optional ``'use_cds'``, ``'name'``, ``'gff_id'`` and ``'use_name'``
        entries.
    """
    location = {
        'strand': f.strand,
        'fmin': int(f.location.start),
        'fmax': int(f.location.end),
    }
    feature_type = {
        'name': _tnType(f),
        'cv': {
            'name': 'sequence',
        }
    }
    record = {'location': location, 'type': feature_type}

    if disable_cds_recalculation:
        # The flag is sent as the string 'true', not a boolean.
        record['use_cds'] = 'true'

    named_types = (coding_transcript_types + noncoding_transcript_types + gene_types
                   + pseudogenes_types + single_level_feature_types)
    if f.type in named_types:
        # Prefer the 'Name' qualifier, falling back to the feature id.
        record['name'] = f.qualifiers.get('Name', [f.id])[0]

    if 'ID' in f.qualifiers:
        record['gff_id'] = f.qualifiers['ID'][0]

    if use_name:
        record['use_name'] = True

    # TODO: handle comments, dbxrefs, attributes, aliases, description,
    # GO, Gene Product, Provenance
    return record


def _yieldCodingTranscriptData(f, disable_cds_recalculation=False, use_name=False):
    """Build the dict describing a transcript and its direct sub-features.

    The two options are only forwarded to the sub-feature builder; the
    transcript entry itself gets neither ``'use_cds'`` nor ``'use_name'``.

    :param f: feature exposing ``strand``, ``location``, ``type``, ``id``,
        ``qualifiers`` and ``sub_features``.
    :param disable_cds_recalculation: forwarded to ``_yieldSubFeatureData``.
    :param use_name: forwarded to ``_yieldSubFeatureData``.
    :return: dict with ``'location'`` and ``'type'``, optional ``'name'`` and
        ``'gff_id'``, and a ``'children'`` list when sub-features exist.
    """
    location = {
        'strand': f.strand,
        'fmin': int(f.location.start),
        'fmax': int(f.location.end),
    }
    transcript = {
        'location': location,
        'type': {
            'name': _tnType(f),
            'cv': {
                'name': 'sequence',
            }
        },
    }

    named_types = (coding_transcript_types + noncoding_transcript_types + gene_types
                   + pseudogenes_types + single_level_feature_types)
    if f.type in named_types:
        # Prefer the 'Name' qualifier, falling back to the feature id.
        transcript['name'] = f.qualifiers.get('Name', [f.id])[0]

    if 'ID' in f.qualifiers:
        transcript['gff_id'] = f.qualifiers['ID'][0]

    child_count = len(f.sub_features)
    if child_count > 0:
        transcript['children'] = [
            _yieldSubFeatureData(
                child,
                disable_cds_recalculation=disable_cds_recalculation,
                use_name=use_name,
            )
            for child in f.sub_features
        ]

    return transcript


def _yieldNonCodingTranscriptData(features, disable_cds_recalculation=False, use_name=False):
    """Build the dict for a non-coding transcript.

    Delegates unchanged to ``_yieldCodingTranscriptData``.

    :param features: the transcript feature to convert.
    :param disable_cds_recalculation: forwarded as-is.
    :param use_name: forwarded as-is.
    :return: whatever ``_yieldCodingTranscriptData`` returns.
    """
    return _yieldCodingTranscriptData(
        features,
        disable_cds_recalculation=disable_cds_recalculation,
        use_name=use_name,
    )


# def _yieldSingleLevelFeatureData(features):
# return _yieldSubFeatureData(features[0])


def yieldApolloData(feature, use_name=False, disable_cds_recalculation=False):
    """Convert one top-level feature, dispatching on its type.

    The type reported by ``_tnType`` selects the builder: gene and pseudogene
    types go to ``_yieldGeneData``, coding and non-coding transcript types to
    their transcript builders, and everything else (single-level features
    included) to ``_yieldSubFeatureData``.

    :param feature: the feature to convert.
    :param use_name: forwarded to the selected builder.
    :param disable_cds_recalculation: forwarded to the selected builder.
    :return: the selected builder's result; a dict, or a list of dicts when
        ``_yieldGeneData`` returns only the transcripts of a gene.
    """
    # Forward the caller's options. They used to be accepted but dropped, so
    # every builder silently ran with its defaults.
    options = {
        'disable_cds_recalculation': disable_cds_recalculation,
        'use_name': use_name,
    }

    feature_type = _tnType(feature)
    if feature_type in gene_types or feature_type in pseudogenes_types:
        return _yieldGeneData(feature, **options)
    if feature_type in coding_transcript_types:
        return _yieldCodingTranscriptData(feature, **options)
    if feature_type in noncoding_transcript_types:
        return _yieldNonCodingTranscriptData(feature, **options)
    # Single-level features and any remaining type share the generic builder.
    return _yieldSubFeatureData(feature, **options)

# # if OGS:
# # TODO: handle comments
# # TODO: handle dbxrefs
# # TODO: handle attributes
# # TODO: handle aliases
# # TODO: handle description
# # TODO: handle GO, Gene Product, Provenance


def _yieldFeatData(features, use_name=False, disable_cds_recalculation=False):
for f in features:
current = {
Expand All @@ -110,14 +237,17 @@ def _yieldFeatData(features, use_name=False, disable_cds_recalculation=False):
}
},
}
if disable_cds_recalculation is True:
if disable_cds_recalculation:
current['use_cds'] = 'true'

if f.type in (coding_transcript_types + noncoding_transcript_types + gene_types + pseudogenes_types
+ single_level_feature_types):
current['name'] = f.qualifiers.get('Name', [f.id])[0]

if use_name is True:
if 'ID' in f.qualifiers:
current['gff_id'] = f.qualifiers['ID'][0]

if use_name:
current['use_name'] = True

# if OGS:
Expand Down Expand Up @@ -150,6 +280,20 @@ def add_property_to_feature(feature, property_key, property_value):
return feature


def features_to_apollo_schema(features, use_name=False, disable_cds_recalculation=False):
    """
    Convert each feature with ``yieldApolloData`` and collect the results.

    :param features: iterable of features to convert.
    :param use_name: passed to ``yieldApolloData`` for every feature.
    :param disable_cds_recalculation: passed to ``yieldApolloData`` for every feature.
    :return: list holding one ``yieldApolloData`` result per input feature, in input order.
    """
    return [
        yieldApolloData(
            feature,
            use_name=use_name,
            disable_cds_recalculation=disable_cds_recalculation,
        )
        for feature in features
    ]


def features_to_feature_schema(features, use_name=False, disable_cds_recalculation=False):
"""
Expand Down
2 changes: 1 addition & 1 deletion arrow/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '4.2.2'
__version__ = '4.2.3'
9 changes: 2 additions & 7 deletions arrow/commands/annotations/load_gff3.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,6 @@
help="Disable CDS recalculation and instead use the one provided",
is_flag=True
)
@click.option(
"--verbose",
help="Verbose mode",
is_flag=True
)
@click.option(
"--timing",
help="Output loading performance metrics",
Expand All @@ -46,11 +41,11 @@
@pass_context
@custom_exception
@str_output
def cli(ctx, organism, gff3, source="", batch_size=1, test=False, use_name=False, disable_cds_recalculation=False, verbose=False, timing=False):
def cli(ctx, organism, gff3, source="", batch_size=1, test=False, use_name=False, disable_cds_recalculation=False, timing=False):
"""Load a full GFF3 into annotation track
Output:
Loading report
"""
return ctx.gi.annotations.load_gff3(organism, gff3, source=source, batch_size=batch_size, test=test, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation, verbose=verbose, timing=timing)
return ctx.gi.annotations.load_gff3(organism, gff3, source=source, batch_size=batch_size, test=test, use_name=use_name, disable_cds_recalculation=disable_cds_recalculation, timing=timing)
1 change: 0 additions & 1 deletion docs/commands/annotations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,6 @@ Load a full GFF3 into annotation track
--use_name Use the given name instead of generating one.
--disable_cds_recalculation Disable CDS recalculation and instead use the one
provided
--verbose Verbose mode
--timing Output loading performance metrics
-h, --help Show this message and exit.
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
requests
biopython
biopython==1.77
cachetools<4
click>=6.7
wrapt
pyyaml
decorator
bcbio-gff
bcbio-gff==0.6.6
pytest-timeit
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

setup(
name="apollo",
version='4.2.2',
version='4.2.3',
description="Apollo API library",
long_description=readme,
author="Helena Rasche;Anthony Bretaudeau;Nathan Dunn",
Expand Down
Loading

0 comments on commit d559ca0

Please sign in to comment.