Skip to content

Commit

Permalink
Merge pull request #21 from infinity-a11y/version_1.5.0
Browse files Browse the repository at this point in the history
Merge Version 1.5.0 into Master
  • Loading branch information
infinity-a11y authored Aug 21, 2024
2 parents c0306bb + 6119b6b commit 5af8e7f
Show file tree
Hide file tree
Showing 18 changed files with 6,018 additions and 2,568 deletions.
5,984 changes: 4,239 additions & 1,745 deletions App.R

Large diffs are not rendered by default.

9 changes: 5 additions & 4 deletions PhyloTrace.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ dependencies:
- r-base=4.3.2
- r-remotes=2.5.0
- kma=1.4.14
- ncbi-amrfinderplus=3.12.8
- parallel=20240522
- pblat=2.5.1
- r-bh=1.81.0-1
Expand Down Expand Up @@ -92,7 +93,7 @@ dependencies:
- r-jquerylib=0.1.4
- r-jsonlite=1.8.8
- r-kableExtra=1.3.4
- r-knitr=1.47
- r-knitr=1.48
- r-labeling=0.4.3
- r-later=1.3.1
- r-lattice=0.22_5
Expand Down Expand Up @@ -159,7 +160,7 @@ dependencies:
- r-tidytree=0.4.5
- r-tidyverse=2.0.0
- r-timechange=0.2.0
- r-tinytex=0.51
- r-tinytex=0.52
- r-tzdb=0.4.0
- r-utf8=1.2.3
- r-uuid=1.1_1
Expand All @@ -169,9 +170,9 @@ dependencies:
- r-vroom=1.6.4
- r-webshot=0.5.5
- r-withr=2.5.1
- r-xfun=0.45
- r-xfun=0.46
- r-xml2=1.3.5
- r-xtable=1.8_4
- r-yaml=2.3.8
- r-yaml=2.3.10
- r-yulab.utils=0.1.0
- r-zoo=1.8_12
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ development. To get a stable version download the newest release.
![PartnerLogos](www/partners_logo_round.svg)

<sup><sup> Developed in collaboration with Hochschule Furtwangen University (HFU) and Medical
University of Graz (MUG). Featured on ShinyConf 2024. </sup> </sup>
University of Graz (MUG). Featured at ShinyConf 2024 and R/Medicine 2024. </sup> </sup>

[![DOI](https://img.shields.io/badge/DOI-10.5281%2Fzenodo.10996423-00e896?labelColor=gray&color=2500ba&logoColor=black)](https://doi.org/10.5281/zenodo.10996423)
[![License: GPL
Expand Down
92 changes: 62 additions & 30 deletions execute/check_duplicate_multi.R
Original file line number Diff line number Diff line change
@@ -1,35 +1,33 @@
library(logr)
# Get the command line arguments
args <- commandArgs(trailingOnly = TRUE)

# Access the first argument
base_path <- args[1]

# Get selected assembly file names
file_names <- list.files(paste0(getwd(), "/selected_genomes"), full.names = T)

# Function to log messages
log.message <- function(log_file, message) {
cat(format(Sys.time(), "%Y-%m-%d %H:%M:%S"), "-", message, "\n", file = log_file, append = TRUE)
}

logfile <- file.path(paste0(base_path, "/logs/check_duplicate_multi.log"))

log <- log_open(logfile, logdir = FALSE)

log_print("Initiated multi typing fasta name duplicates check")

# load selected assemblies
assemblies <- lapply(list.files(paste0(getwd(), "/selected_genomes"), full.names = T), readLines)

# loop through every assembly
for(i in 1:length(assemblies)){
names <- stringr::str_extract(assemblies[[i]][seq(1, length(assemblies[[i]]), by = 3)], "^[^\\s]+")
process_fasta <- function(fasta_path) {
# Read the FASTA file into a data.table
dt <- data.table::fread(fasta_path, header = FALSE, sep = "\n", col.names = "line", data.table = TRUE)

# Identify headers and sequence lines
dt[, is_header := grepl("^>", line)]

# Create a group identifier for each sequence based on headers
dt[, group := cumsum(is_header)]

# Process each group to concatenate sequences, keeping headers as is
result <- dt[, .(header = line[1],
sequence = paste(line[!is_header], collapse = "")),
by = group]

# Prepare the final output as a character vector
# Ensure exactly one empty line between sequence end and next header
output <- unlist(result[, .(output = c(header, sequence, "")), by = group]$output)

# Remove the last empty line to avoid trailing empty line in the file
output <- output[-length(output)]

names <- stringr::str_extract(output[seq(1, length(output), by = 3)], "^[^\\s]+")

# Test if there are duplicates
if(length(names) != length(unique(names))){

log_print(paste0("Duplicate(s) present in ", basename(file_names[i])))
log_print(paste0("Duplicate(s) present in ", basename(fasta_path)))

# add a number to the duplicates
for(j in 1:length(names)){
Expand All @@ -41,12 +39,46 @@ for(i in 1:length(assemblies)){

# substitute the respective lines in the file with the new names
for(k in 1:length(names)){
assemblies[[i]][3*k - 2] <- paste0(names[k])
output[3*k - 2] <- paste0(names[k])
}

# save the new assembly
writeLines(assemblies[[i]], file_names[i])
}
# save formatted fasta
writeLines(output, fasta_path)

# Return invisible NULL to suppress output
invisible(NULL)

} else {

# save formatted fasta
writeLines(output, fasta_path)

# Return invisible NULL to suppress output
invisible(NULL)
}
}

# Command-line arguments passed after the script name
args <- commandArgs(trailingOnly = TRUE)

# First argument: base directory that contains the "logs" folder
base_path <- args[1]

# Full paths of every file in "selected_genomes" under the working directory
assemblies <- list.files(file.path(getwd(), "selected_genomes"), full.names = TRUE)

# Append one timestamped message line to `log_file`.
# NOTE(review): not called in the visible part of this script, which logs
# through log_print() instead -- confirm before removing.
log.message <- function(log_file, message) {
  cat(format(Sys.time(), "%Y-%m-%d %H:%M:%S"), "-", message, "\n", file = log_file, append = TRUE)
}

# Log file of this check, located at <base_path>/logs/
logfile <- file.path(base_path, "logs", "check_duplicate_multi.log")

log <- log_open(logfile, logdir = FALSE)

log_print("Initiated multi typing fasta check and formatting")

# Check and format every selected assembly; process_fasta() is called for its
# side effects only, so the list of NULLs returned by lapply() is discarded
invisible(lapply(assemblies, process_fasta))

log_close()
88 changes: 61 additions & 27 deletions execute/check_duplicate_single.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,63 @@
library(logr)

# Reformat a FASTA file and make its sequence names unique.
#
# Reads `fasta_path`, joins all sequence lines that follow a header into a
# single line and lays the records out as header / sequence / empty line
# (without a trailing empty line). A record's name is its header up to the
# first whitespace. If names are repeated, the file name is logged, repeated
# names get "_<record index>" appended, and every header in the file is
# replaced by its (possibly renamed) bare name. Without duplicates the headers
# are written unchanged. The result is written to
# "<working directory>/blat_single/<basename of fasta_path>".
#
# fasta_path: path of the FASTA file to process.
# Returns NULL invisibly; the function is called for its side effects.
process_fasta <- function(fasta_path) {
  # Read the file with one row per line
  dt <- data.table::fread(fasta_path, header = FALSE, sep = "\n",
                          col.names = "line", data.table = TRUE)

  # Header lines start with ">"; every header opens a new record
  dt[, is_header := grepl("^>", line)]
  dt[, group := cumsum(is_header)]

  # Per record: keep the first line as header, concatenate the sequence lines
  result <- dt[, .(header = line[1],
                   sequence = paste(line[!is_header], collapse = "")),
               by = group]

  # Three lines per record: header, sequence, empty separator
  output <- unlist(result[, .(output = c(header, sequence, "")), by = group]$output)

  # Drop the separator after the last record
  output <- output[-length(output)]

  # Headers sit on every third line because of the layout built above
  header_idx <- seq(1, length(output), by = 3)
  seq_names <- stringr::str_extract(output[header_idx], "^[^\\s]+")

  if (anyDuplicated(seq_names) > 0) {

    log_print(paste0("Duplicate(s) present in ", basename(fasta_path)))

    # Append the record index to every name that is still shared. The vector
    # is updated in place, so later iterations also see (and resolve) a clash
    # with an already renamed entry, and the last occurrence of a name keeps
    # it once its siblings have been renamed. The suffix is the scalar `j`;
    # pasting the whole which() vector here would assign several values to a
    # single element and raise a replacement-length warning.
    for (j in seq_along(seq_names)) {
      if (sum(seq_names == seq_names[j]) > 1) {
        seq_names[j] <- paste0(seq_names[j], "_", j)
      }
    }

    # Replace the header lines with the de-duplicated names
    output[header_idx] <- seq_names
  }

  # Save the formatted fasta
  writeLines(output, paste0(getwd(), "/blat_single/", basename(fasta_path)))

  # Return invisible NULL to suppress output
  invisible(NULL)
}

typing_meta <- readRDS(paste0(getwd(), "/single_typing_df.rds"))

# Function to log messages
Expand All @@ -14,32 +73,7 @@ log_print("Initiated single typing fasta name duplicates check")

assembly <- typing_meta$genome

lines <- readLines(assembly)

names <- stringr::str_extract(lines[seq(1, length(lines), by = 3)], "^[^\\s]+")

# Test if there are duplicates
if(length(names) != length(unique(names))) {

log_print(paste0("Duplicate(s) present in ", basename(assembly)))

# add a number to the duplicates
for(i in 1:length(names)) {
if(sum(names == names[i]) > 1) {
indices <- which(names == names[i])
names[i] <- paste0(names[i], "_", indices[which(indices == i)])
}
}

# substitute the respective lines in the file with the new names
for(i in 1:length(names)) {
lines[3*i - 2] <- paste0(names[i])
}

# save the new assembly to working directory
writeLines(lines, paste0(getwd(), "/blat_single/assembly.fasta"))
} else {
writeLines(lines, paste0(getwd(), "/blat_single/assembly.fasta"))
}
# Check and format fasta of assemblies
invisible(lapply(assembly, process_fasta))

log_close()
2 changes: 1 addition & 1 deletion execute/kill_multi.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,4 @@ else
kill "$PID"
fi

echo 0 > $log_file
echo 0 > $log_file
Loading

0 comments on commit 5af8e7f

Please sign in to comment.