From 820ff5fcec2c40123082a1f7d20409ccc8c80953 Mon Sep 17 00:00:00 2001
From: James McLaughlin
Date: Wed, 8 Jan 2025 16:12:01 +0000
Subject: [PATCH] script updates, add hash to results

---
 .../solr/grebi_link_results/Cargo.toml        |  5 ++-
 .../solr/grebi_link_results/src/main.rs       |  9 ++++++
 dataload/Cargo.lock                           |  2 ++
 dataload/nextflow/load_subgraph.nf            | 12 +++----
 dataload/scripts/check_datarelease.sh         |  0
 dataload/scripts/ebi_datarelease_to_ftp.sh    | 31 +++++++++++++++++++
 .../scripts/ebi_datarelease_to_staging.sh     |  6 ++--
 7 files changed, 55 insertions(+), 10 deletions(-)
 mode change 100644 => 100755 dataload/scripts/check_datarelease.sh

diff --git a/dataload/08_create_other_dbs/solr/grebi_link_results/Cargo.toml b/dataload/08_create_other_dbs/solr/grebi_link_results/Cargo.toml
index 72ff6fb..4313c0b 100644
--- a/dataload/08_create_other_dbs/solr/grebi_link_results/Cargo.toml
+++ b/dataload/08_create_other_dbs/solr/grebi_link_results/Cargo.toml
@@ -11,4 +11,7 @@ csv = "1.3.0"
 lmdb-zero = "0.4.4"
 bloomfilter = "1.0.13"
 jemallocator = "0.5.4"
-clap = { version = "4.4.11", features = ["derive"] }
\ No newline at end of file
+clap = { version = "4.4.11", features = ["derive"] }
+sha1 = "0.10.6"
+hex = "0.4.3"
+
diff --git a/dataload/08_create_other_dbs/solr/grebi_link_results/src/main.rs b/dataload/08_create_other_dbs/solr/grebi_link_results/src/main.rs
index 013b338..283b213 100644
--- a/dataload/08_create_other_dbs/solr/grebi_link_results/src/main.rs
+++ b/dataload/08_create_other_dbs/solr/grebi_link_results/src/main.rs
@@ -10,6 +10,7 @@ use std::io::{Write, BufWriter};
 use grebi_shared::json_lexer::{lex, JsonTokenType};
 use grebi_shared::json_parser::JsonParser;
 use clap::Parser;
+use sha1::{Sha1, Digest};
 
 use serde_json::Value;
 
@@ -91,6 +92,14 @@ fn main() {
     json.insert("_refs".to_string(), Value::Object(refs));
     json.insert("_node_ids".to_string(), Value::Array( nodeids.iter().map(|id| Value::String(id.clone())).collect()));
 
+    // sha1 not for security, just as a simple way to assign a unique
+    // id to the result to use to reference it in the api
+    //
+    let mut hasher = Sha1::new();
+    hasher.update(&line);
+    let hash = hasher.finalize();
+    json.insert("id".to_string(), Value::String(hex::encode(hash)));
+
     writer.write_all(Value::Object(json).to_string().as_bytes()).unwrap();
     writer.write_all("\n".as_bytes()).unwrap();
 
diff --git a/dataload/Cargo.lock b/dataload/Cargo.lock
index c36c9ed..49267e4 100644
--- a/dataload/Cargo.lock
+++ b/dataload/Cargo.lock
@@ -502,9 +502,11 @@ dependencies = [
  "clap",
  "csv",
  "grebi_shared",
+ "hex",
  "jemallocator",
  "lmdb-zero",
  "serde_json",
+ "sha1",
 ]
 
 [[package]]
diff --git a/dataload/nextflow/load_subgraph.nf b/dataload/nextflow/load_subgraph.nf
index a8d795a..a167961 100644
--- a/dataload/nextflow/load_subgraph.nf
+++ b/dataload/nextflow/load_subgraph.nf
@@ -300,7 +300,7 @@ process create_sqlite {
     errorStrategy 'retry'
     maxRetries 10
 
-    publishDir "${params.out}/${params.subgraph}", overwrite: true
+    publishDir "${params.out}", overwrite: true
 
     input:
         val(compressed_blobs)
@@ -424,7 +424,7 @@ process run_materialised_queries {
     time "8h"
     cpus "8"
 
-    publishDir "${params.out}/${params.subgraph}", overwrite: true
+    publishDir "${params.out}", overwrite: true
 
     input:
         path(neo_db)
@@ -500,7 +500,7 @@ process add_query_metadatas_to_graph_metadata {
     time "8h"
     cpus "8"
 
-    publishDir "${params.out}/${params.subgraph}", overwrite: true
+    publishDir "${params.out}", overwrite: true
 
     input:
         path(metadata_jsons)
@@ -526,7 +526,7 @@ process csvs_to_sqlite {
     time "12h"
     cpus "8"
 
-    publishDir "${params.out}/${params.subgraph}", overwrite: true
+    publishDir "${params.out}", overwrite: true
 
     input:
         path(csvs)
@@ -659,7 +659,7 @@ process package_neo {
     time "8h"
     cpus "8"
 
-    publishDir "${params.out}${params.subgraph}", overwrite: true
+    publishDir "${params.out}", overwrite: true
 
     input:
         path("${params.subgraph}_neo4j")
@@ -679,7 +679,7 @@ process package_solr {
     time "8h"
     cpus "8"
 
-    publishDir "${params.out}/${params.subgraph}", overwrite: true
+    publishDir "${params.out}", overwrite: true
 
     input:
         path(cores)
diff --git a/dataload/scripts/check_datarelease.sh b/dataload/scripts/check_datarelease.sh
old mode 100644
new mode 100755
diff --git a/dataload/scripts/ebi_datarelease_to_ftp.sh b/dataload/scripts/ebi_datarelease_to_ftp.sh
index 05a7907..b641be7 100755
--- a/dataload/scripts/ebi_datarelease_to_ftp.sh
+++ b/dataload/scripts/ebi_datarelease_to_ftp.sh
@@ -1,2 +1,33 @@
 #!/bin/bash
+#!/bin/bash
+
+if [ "$SLURM_JOB_PARTITION" != "datamover" ]; then
+    echo "Must run on a datamover node"
+    exit 1
+fi
+
+if [ "$#" -ne 2 ]; then
+    echo "Usage: $0 <subgraph> <datarelease_path>"
+    exit 1
+fi
+
+SUBGRAPH=$1
+DATARELEASE_PATH=$2
+VERSION=$(date +"%Y-%b-%d")
+
+./check_datarelease.sh $SUBGRAPH $DATARELEASE_PATH
+
+FTP_PATH=/nfs/ftp/public/databases/spot/kg/$SUBGRAPH/$VERSION
+LATEST_PATH=/nfs/ftp/public/databases/spot/kg/$SUBGRAPH/latest
+
+echo "Copying $DATARELEASE_PATH to $FTP_PATH"
+
+rm -rf $FTP_PATH/*
+cp -Lr $DATARELEASE_PATH/* $FTP_PATH/
+ln -s $FTP_PATH $LATEST_PATH
+
+
+
+
 
diff --git a/dataload/scripts/ebi_datarelease_to_staging.sh b/dataload/scripts/ebi_datarelease_to_staging.sh
index 52384d9..caa9855 100755
--- a/dataload/scripts/ebi_datarelease_to_staging.sh
+++ b/dataload/scripts/ebi_datarelease_to_staging.sh
@@ -39,9 +39,9 @@ rm -rf $STAGING_PATH/sqlite/${SUBGRAPH}.sqlite3
 echo Extracting new data release
 tar --use-compress-program=pigz -xf $DATARELEASE_PATH/${SUBGRAPH}_neo4j.tgz -C $STAGING_PATH/neo4j
-tar --use-compress-program=pigz -xf $DATARELEASE_PATH/${SUBGRAPH}_solr.tgz -C $STAGING_PATH/solr
-cp -f $DATARELEASE_PATH/${SUBGRAPH}_metadata.json $STAGING_PATH/metadata
-cp -f $DATARELEASE_PATH/${SUBGRAPH}.sqlite3 $STAGING_PATH/sqlite
+tar --use-compress-program=pigz -xf $DATARELEASE_PATH/${SUBGRAPH}_solr.tgz -C $STAGING_PATH
+cp -f $DATARELEASE_PATH/${SUBGRAPH}_metadata.json $STAGING_PATH/metadata/
+cp -f $DATARELEASE_PATH/${SUBGRAPH}.sqlite3 $STAGING_PATH/sqlite/