Skip to content

Commit

Permalink
Adjust to PubChemRDF from uniprot-kg
Browse files — browse the repository at this point in the history
  • Loading branch information
zmughal committed Feb 7, 2024
1 parent d180c6e commit 9817343
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 87 deletions.
61 changes: 0 additions & 61 deletions dvc.lock

This file was deleted.

4 changes: 2 additions & 2 deletions dvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ stages:
# NOTE(review): indentation is flattened in this span, and `checksum` / `void`
# appear as adjacent entries -- presumably the removed and added sides of a
# diff rendered without +/- markers. Confirm which entry is current before reuse.
# Dependencies and outputs of the stage whose name lies above this span.
deps:
- stages/00_invalidate.sh
outs:
- checksum
- void
# Stage `download`: its command is stages/01_download.sh.
download:
cmd: stages/01_download.sh
deps:
- stages/01_download.sh
# The entry listed under the preceding `outs:` is listed again here as an input.
- checksum
- void
outs:
- download:
# NOTE(review): `persist` is understood to keep this output in place when the
# stage is re-run -- confirm against the DVC dvc.yaml reference.
persist: true
Expand Down
16 changes: 8 additions & 8 deletions stages/00_invalidate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,16 @@
# Fetch a small remote marker file into a directory under the current working
# directory, using wget.
# NOTE(review): this span looks like a rendered diff hunk: the checksum_* lines
# and the void_* lines appear to be the "before" and "after" versions of the
# same statements, shown without +/- markers. Confirm which set is current.
localpath=$(pwd)
echo "Local path: $localpath"

# Define the release URL for the dataset
checksum_url="https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/RELEASE.metalink"
# Define the VoID URL for the dataset
void_url="https://ftp.ncbi.nlm.nih.gov/pubchem/RDF/void.ttl"

# Create the checksum directory
checksumpath="$localpath/checksum"
echo "Checksum path: $checksumpath"
mkdir -p "$checksumpath"
# NOTE(review): $checksumpath is unquoted here although mkdir above quotes it;
# a working directory containing whitespace would be word-split.
cd $checksumpath;
# Create the VoID directory
voidpath="$localpath/void"
echo "VoID path: $voidpath"
mkdir -p "$voidpath"
# NOTE(review): same unquoted expansion as the cd above.
cd $voidpath;

# Download file
# -P sets the directory wget saves into; the target path and URL are both
# expanded unquoted.
wget -P $checksumpath $checksum_url
wget -P $voidpath $void_url

echo "Download done."
8 changes: 5 additions & 3 deletions stages/01_download.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
# Set up the source URL and the local target directory for the download stage.
# NOTE(review): both metalink_url and ftp_url are defined in this span; this
# looks like the removed and added sides of a diff shown without +/- markers.
# Confirm which variable the current script actually uses.
localpath=$(pwd)
echo "Local path: $localpath"

# Define the release URL for the dataset
metalink_url="https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/RELEASE.metalink"
# Define the FTP URL for the dataset
# HTTPS and FTP spellings of the same host and path; only the FTP form is
# assigned to ftp_url below:
# https://ftp.ncbi.nlm.nih.gov/pubchem/RDF/
# ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF/
ftp_url="ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF/"

# Create the download directory
# (only the path is computed here; the mkdir is not part of this span)
downloadpath="$localpath/download"
Expand All @@ -16,4 +18,4 @@ mkdir -p "$downloadpath"
# Enter the download directory and fetch the dataset with aria2c.
# NOTE(review): $downloadpath is expanded unquoted in cd and in both aria2c
# calls; a path containing whitespace would be word-split.
cd $downloadpath;

# Download files
# -c continues partially downloaded files; -d sets the directory to store into.
# NOTE(review): the two invocations look like the before/after sides of a diff.
# The first passes a .metalink file URL; the second passes a bare directory URL
# (ftp_url ends in "/"). Verify that aria2c retrieves a directory tree from such
# a URL -- it is not obviously equivalent to a metalink listing individual files.
aria2c -c -d $downloadpath $metalink_url
aria2c -c -d $downloadpath $ftp_url
22 changes: 9 additions & 13 deletions stages/02_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Strict mode: exit on error (-e), on use of an unset variable (-u), and when
# any stage of a pipeline fails (pipefail).
set -euo pipefail

# NOTE(review): the next two description lines contradict each other; they look
# like the old and new wording from a diff. The conversion command visible
# further down parses Turtle input, so the second line is presumably current.
# Script to convert RDF/XML to RDF HDT
# Script to convert Turtle to RDF HDT

# Get local path
localpath=$(pwd)
Expand All @@ -24,7 +24,7 @@ brickpath="$localpath/brick"
# Create the brick (final output) directory and report it.
# NOTE(review): $brickpath is expanded unquoted in mkdir.
mkdir -p $brickpath
echo "Brick path: $brickpath"

# Base URI handed to the RDF tools further down.
# NOTE(review): base_uri is assigned twice in a row; if both lines ran, only the
# second value would remain. They look like the before/after sides of a diff.
base_uri="https://www.uniprot.org/"
base_uri="http://rdf.ncbi.nlm.nih.gov/pubchem/"

# Set TMPDIR on same filesystem
# (BUILD_TMPDIR lives under $buildpath)
export BUILD_TMPDIR=$buildpath/tmp
Expand All @@ -37,29 +37,25 @@ export TMPDIR=$BUILD_TMPDIR
# Convert every compressed RDF input found under $downloadpath into an HDT file
# in a pre-stage directory, then copy the non-RDF files and move the HDT files
# into $brickpath.
#
# NOTE(review): this span looks like a rendered diff with removed and added
# lines interleaved (two `export`, two `find`, two RDF_HDT assignments, an
# xz/rdfxml pipeline next to a gzip/turtle one). Confirm which lines are
# current before reuse. Points to check in the parallel command below:
#   * RDF is built from {/.}, GNU parallel's "basename with one extension
#     removed", so a *.ttl.gz input gives a name ending in .ttl; the
#     `basename "$RDF" .rdf` call strips only a .rdf suffix, so such inputs
#     would produce <name>.ttl.hdt -- confirm that is intended.
#   * RDF_HDT_DIR_REL is the path of the input file itself relative to
#     $downloadpath (realpath is applied to {}), yet it is used as a directory
#     component of RDF_HDT; no mkdir for that directory is visible in this span.
#   * `gzip -dk < {}` reads from stdin, so -k (keep the input file) presumably
#     has nothing to act on.
#   * The `[ ! -s $RDF_HDT ]` guard skips inputs whose HDT output already exists
#     and is non-empty.
export buildpath_prestage=$buildpath/prestage
mkdir -p $buildpath_prestage

export buildpath brickpath base_uri
find $downloadpath -type f -name '*.rdf.xz' | sort \
| grep -vFf <( cat <<'EOF' # remove empty files
uniparc_patents.rdf.xz
EOF
) \
export downloadpath buildpath brickpath base_uri
find $downloadpath -type f -name '*.ttl.gz' | sort \
| parallel -J ./parallel.prf --bar '
set -euo pipefail;
RDF=$buildpath/{/.};
RDF_HDT="$buildpath_prestage"/"$(basename "$RDF" .rdf).hdt";
RDF_HDT_DIR_REL="$(realpath -s --relative-to="$downloadpath" {})";
RDF_HDT="$buildpath_prestage"/"$RDF_HDT_DIR_REL"/"$(basename "$RDF" .rdf).hdt";
export RDF2HDTCAT_JAVA_OPTS="-Xmx24g";
if [ ! -s $RDF_HDT ]; then
echo "Processing {}"
xz -T1 -dk < {} \
| rapper --input rdfxml --output ntriples - "$base_uri" \
gzip -dk < {} \
| rapper --input turtle --output ntriples - "$base_uri" \
| rdf2hdtcat-parpipe $base_uri $RDF_HDT
fi
'

# Copy the regular files sitting directly in $downloadpath (-maxdepth 1) whose
# names do not match the excluded pattern(s) into $brickpath.
# NOTE(review): this find is limited to depth 1 while the find feeding parallel
# above has no depth limit -- confirm non-RDF files in subdirectories are meant
# to be left behind.
find $downloadpath/ -maxdepth 1 \
-type f \! -name '*.rdf.xz' \
-type f \! -name '*.ttl.gz' \
-exec cp -v {} $brickpath/ \;

# Move everything from the pre-stage directory into the brick directory.
mv -v $buildpath_prestage/* $brickpath/

0 comments on commit 9817343

Please sign in to comment.