Skip to content

Commit

Permalink
Adjust to PubChemRDF from uniprot-kg
Browse files Browse the repository at this point in the history
  • Loading branch information
zmughal committed Feb 7, 2024
1 parent d180c6e commit 9c498f1
Show file tree
Hide file tree
Showing 8 changed files with 35 additions and 98 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
logs
/checksum
/void
/download
/build
/brick
13 changes: 6 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
# UniProt
# PubChemRDF-kg

<a href="https://github.com/biobricks-ai/uniprot-kg/actions"><img src="https://github.com/biobricks-ai/uniprot-kg/actions/workflows/bricktools-check.yaml/badge.svg?branch=master"/></a>
<a href="https://github.com/biobricks-ai/pubchemrdf-kg/actions"><img src="https://github.com/biobricks-ai/pubchemrdf-kg/actions/workflows/bricktools-check.yaml/badge.svg?branch=master"/></a>

## Description

> UniProt is a freely accessible database of protein sequence and functional
> information.
> The PubChem Compound, Substance, and BioAssay databases in RDF format.
Homepage:
- <https://sparql.uniprot.org/>
- <https://www.uniprot.org/help/downloads>
- <https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/>
- <https://pubchem.ncbi.nlm.nih.gov/docs/rdf>
- <https://pubchem.ncbi.nlm.nih.gov/docs/rdf-ftp>
- <https://ftp.ncbi.nlm.nih.gov/pubchem/RDF/>
61 changes: 0 additions & 61 deletions dvc.lock

This file was deleted.

4 changes: 2 additions & 2 deletions dvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ stages:
deps:
- stages/00_invalidate.sh
outs:
- checksum
- void
download:
cmd: stages/01_download.sh
deps:
- stages/01_download.sh
- checksum
- void
outs:
- download:
persist: true
Expand Down
4 changes: 2 additions & 2 deletions flake.nix
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
description = "UniProt KG BioBrick";
description = "PubChemRDF KG BioBrick";

inputs = {
nixpkgs.url = "github:nixos/nixpkgs/nixos-23.05";
Expand All @@ -24,7 +24,7 @@
hdt-cpp.packages.${system}.default
hdt-java.packages.${system}.default
librdf
aria2
lftp
(lib.hiPrio pkgs.parallel-full) # prefer GNU Parallel over `moreutils`
moreutils
];
Expand Down
15 changes: 7 additions & 8 deletions stages/00_invalidate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,15 @@
localpath=$(pwd)
echo "Local path: $localpath"

# Define the release URL for the dataset
checksum_url="https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/RELEASE.metalink"
# Define the VoID URL for the dataset
void_url="https://ftp.ncbi.nlm.nih.gov/pubchem/RDF/void.ttl"

# Create the checksum directory
checksumpath="$localpath/checksum"
echo "Checksum path: $checksumpath"
mkdir -p "$checksumpath"
cd $checksumpath;
# Create the VoID directory
voidpath="$localpath/void"
echo "VoID path: $voidpath"
mkdir -p "$voidpath"

# Download file
wget -P $checksumpath $checksum_url
wget -P $voidpath $void_url

echo "Download done."
12 changes: 8 additions & 4 deletions stages/01_download.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,18 @@
localpath=$(pwd)
echo "Local path: $localpath"

# Define the release URL for the dataset
metalink_url="https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/RELEASE.metalink"
# Define the FTP URL for the dataset
# https://ftp.ncbi.nlm.nih.gov/pubchem/RDF/
# ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF/
ftp_url="ftp://ftp.ncbi.nlm.nih.gov/pubchem/RDF/"

# Create the download directory
downloadpath="$localpath/download"
echo "Download path: $downloadpath"
mkdir -p "$downloadpath"
cd $downloadpath;

# Download files
aria2c -c -d $downloadpath $metalink_url
(
cd "$downloadpath";
lftp -c "connect $ftp_url ; mirror --verbose -c -P $ftp_url"; \
);
22 changes: 9 additions & 13 deletions stages/02_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

set -euo pipefail

# Script to convert RDF/XML to RDF HDT
# Script to convert Turtle to RDF HDT

# Get local path
localpath=$(pwd)
Expand All @@ -24,7 +24,7 @@ brickpath="$localpath/brick"
mkdir -p $brickpath
echo "Brick path: $brickpath"

base_uri="https://www.uniprot.org/"
base_uri="http://rdf.ncbi.nlm.nih.gov/pubchem/"

# Set TMPDIR on same filesystem
export BUILD_TMPDIR=$buildpath/tmp
Expand All @@ -37,29 +37,25 @@ export TMPDIR=$BUILD_TMPDIR
export buildpath_prestage=$buildpath/prestage
mkdir -p $buildpath_prestage

export buildpath brickpath base_uri
find $downloadpath -type f -name '*.rdf.xz' | sort \
| grep -vFf <( cat <<'EOF' # remove empty files
uniparc_patents.rdf.xz
EOF
) \
export downloadpath buildpath brickpath base_uri
find $downloadpath -type f -name '*.ttl.gz' | sort \
| parallel -J ./parallel.prf --bar '
set -euo pipefail;
RDF=$buildpath/{/.};
RDF_HDT="$buildpath_prestage"/"$(basename "$RDF" .rdf).hdt";
RDF_HDT_DIR_REL="$(realpath -s --relative-to="$downloadpath" {})";
RDF_HDT="$buildpath_prestage"/"$RDF_HDT_DIR_REL"/"$(basename "$RDF" .rdf).hdt";
export RDF2HDTCAT_JAVA_OPTS="-Xmx24g";
if [ ! -s $RDF_HDT ]; then
echo "Processing {}"
xz -T1 -dk < {} \
| rapper --input rdfxml --output ntriples - "$base_uri" \
gzip -dk < {} \
| rapper --input turtle --output ntriples - "$base_uri" \
| rdf2hdtcat-parpipe $base_uri $RDF_HDT
fi
'

find $downloadpath/ -maxdepth 1 \
-type f \! -name '*.rdf.xz' \
-type f \! -name '*.ttl.gz' \
-exec cp -v {} $brickpath/ \;

mv -v $buildpath_prestage/* $brickpath/

0 comments on commit 9c498f1

Please sign in to comment.