-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #452 from desci-labs/develop
promote main
- Loading branch information
Showing
39 changed files
with
1,519 additions
and
52 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
FROM docker.elastic.co/logstash/logstash:8.14.3 | ||
|
||
USER root | ||
|
||
# Install curl | ||
RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/* | ||
|
||
# Copy the initialization script | ||
COPY desci-elastic/init-logstash.sh /usr/local/bin/init-logstash.sh | ||
RUN chmod +x /usr/local/bin/init-logstash.sh | ||
|
||
USER logstash | ||
|
||
# Set the entrypoint to the initialization script | ||
ENTRYPOINT ["/usr/local/bin/init-logstash.sh"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
#!/bin/bash | ||
|
||
set -e | ||
|
||
# Configuration | ||
DRIVER_URL="https://jdbc.postgresql.org/download/postgresql-42.7.3.jar" | ||
DRIVER_DIR="/opt/logstash/drivers" | ||
DRIVER_FILE="$DRIVER_DIR/postgresql-42.7.3.jar" | ||
|
||
# Ensure the driver directory exists | ||
mkdir -p "$DRIVER_DIR" | ||
|
||
|
||
download_driver() { | ||
echo "Downloading PostgreSQL JDBC driver..." | ||
curl -# -o "$DRIVER_FILE" "$DRIVER_URL" | ||
chmod 644 "$DRIVER_FILE" | ||
echo "Driver downloaded and permissions set." | ||
} | ||
|
||
# Check if driver exists and download if necessary | ||
if [ -f "$DRIVER_FILE" ]; then | ||
echo "PostgreSQL JDBC driver already exists." | ||
else | ||
download_driver | ||
fi | ||
|
||
# Verify the driver file | ||
if [ ! -f "$DRIVER_FILE" ]; then | ||
echo "Error: Failed to download or locate the PostgreSQL JDBC driver." | ||
exit 1 | ||
fi | ||
|
||
# Ensure correct permissions on the driver file | ||
chmod 644 "$DRIVER_FILE" | ||
|
||
# Start Logstash with the provided pipeline configuration | ||
exec logstash -f /usr/share/logstash/pipeline/logstash.conf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
input { | ||
jdbc { | ||
jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" | ||
jdbc_driver_class => "org.postgresql.Driver" | ||
jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" | ||
jdbc_user => "${ES_DB_USER}" | ||
jdbc_password => "${ES_DB_PASSWORD}" | ||
statement => " | ||
SELECT | ||
id::text, | ||
orcid::text, | ||
display_name::text, | ||
display_name_alternatives::text, | ||
works_count::text, | ||
cited_by_count::text, | ||
last_known_institution::text, | ||
works_api_url::text, | ||
updated_date::text | ||
FROM openalex.authors | ||
WHERE updated_date > CAST(:sql_last_value AS TIMESTAMP) | ||
ORDER BY updated_date ASC, id ASC | ||
LIMIT 1000 | ||
" | ||
use_column_value => true | ||
tracking_column => "updated_date" | ||
tracking_column_type => "timestamp" | ||
last_run_metadata_path => "/usr/share/logstash/data/.logstash_jdbc_last_run" | ||
jdbc_paging_enabled => true | ||
jdbc_page_size => 1000 | ||
} | ||
} | ||
|
||
filter { | ||
mutate { | ||
remove_field => ["@version", "@timestamp"] | ||
} | ||
json { | ||
source => "display_name_alternatives" | ||
target => "display_name_alternatives" | ||
skip_on_invalid_json => true | ||
} | ||
json { | ||
source => "last_known_institution" | ||
target => "last_known_institution" | ||
skip_on_invalid_json => true | ||
} | ||
mutate { | ||
convert => { | ||
"works_count" => "integer" | ||
"cited_by_count" => "integer" | ||
} | ||
} | ||
} | ||
|
||
output { | ||
elasticsearch { | ||
hosts => ["${ES_NODE}"] | ||
index => "authors" | ||
document_id => "%{id}" | ||
doc_as_upsert => true | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
input { | ||
jdbc { | ||
jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" | ||
jdbc_driver_class => "org.postgresql.Driver" | ||
jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" | ||
jdbc_user => "${ES_DB_USER}" | ||
jdbc_password => "${ES_DB_PASSWORD}" | ||
statement => " | ||
SELECT | ||
work_id, | ||
author_id, | ||
author_position, | ||
raw_affiliation_string, | ||
institution_id | ||
FROM openalex.works_authorships | ||
ORDER BY work_id ASC | ||
LIMIT 10000 | ||
" | ||
jdbc_paging_enabled => true | ||
jdbc_page_size => 1000 | ||
} | ||
} | ||
|
||
filter { | ||
mutate { | ||
remove_field => ["@version", "@timestamp"] | ||
} | ||
} | ||
|
||
output { | ||
elasticsearch { | ||
hosts => ["${ES_NODE}"] | ||
index => "works_authorships" | ||
document_id => "%{work_id}-%{author_id}" | ||
doc_as_upsert => true | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
input { | ||
jdbc { | ||
jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" | ||
jdbc_driver_class => "org.postgresql.Driver" | ||
jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" | ||
jdbc_user => "${ES_DB_USER}" | ||
jdbc_password => "${ES_DB_PASSWORD}" | ||
statement => " | ||
SELECT | ||
id::TEXT, | ||
orcid::TEXT, | ||
display_name::TEXT, | ||
display_name_alternatives::TEXT, | ||
works_count::TEXT, | ||
cited_by_count::TEXT, | ||
last_known_institution::TEXT, | ||
works_api_url::TEXT, | ||
updated_date::TEXT | ||
FROM openalex.authors | ||
WHERE updated_date > CAST(:sql_last_value AS TIMESTAMP) | ||
ORDER BY updated_date ASC, id ASC | ||
LIMIT 1000 | ||
" | ||
use_column_value => true | ||
tracking_column => "updated_date" | ||
tracking_column_type => "timestamp" | ||
last_run_metadata_path => "/usr/share/logstash/data/.logstash_jdbc_last_run" | ||
jdbc_paging_enabled => true | ||
jdbc_page_size => 1000 | ||
codec => json | ||
} | ||
} | ||
|
||
filter { | ||
mutate { | ||
remove_field => ["@version", "@timestamp"] | ||
} | ||
json { | ||
source => "display_name_alternatives" | ||
target => "display_name_alternatives" | ||
skip_on_invalid_json => true | ||
} | ||
json { | ||
source => "last_known_institution" | ||
target => "last_known_institution" | ||
skip_on_invalid_json => true | ||
} | ||
mutate { | ||
convert => { | ||
"works_count" => "integer" | ||
"cited_by_count" => "integer" | ||
} | ||
} | ||
} | ||
|
||
output { | ||
stdout { codec => json } | ||
elasticsearch { | ||
hosts => ["${ES_NODE}"] | ||
index => "authors" | ||
document_id => "%{id}" | ||
doc_as_upsert => true | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
# Imports a denormalized works table including the authors table and the works_authorships join table | ||
input { | ||
jdbc { | ||
jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" | ||
jdbc_driver_class => "org.postgresql.Driver" | ||
jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" | ||
jdbc_user => "${ES_DB_USER}" | ||
jdbc_password => "${ES_DB_PASSWORD}" | ||
jdbc_paging_enabled => true | ||
jdbc_page_size => 100 | ||
use_column_value => true | ||
tracking_column => "publication_date" | ||
tracking_column_type => "timestamp" | ||
last_run_metadata_path => "/usr/share/logstash/data/.logstash_jdbc_last_run" | ||
statement => " | ||
SELECT | ||
w.id::TEXT AS work_id, | ||
w.doi::TEXT, | ||
w.title::TEXT, | ||
w.publication_year::TEXT, | ||
w.type::TEXT, | ||
w.cited_by_count::TEXT AS cited_by_count, | ||
w.abstract_inverted_index::TEXT as abstract_inverted_index, | ||
w.publication_date::TIMESTAMP AS publication_date | ||
FROM openalex.works w | ||
WHERE w.publication_date::TIMESTAMP > CAST(:sql_last_value AS TIMESTAMP) | ||
ORDER BY w.publication_date::TIMESTAMP ASC, w.id ASC | ||
LIMIT 100 | ||
" | ||
codec => json | ||
} | ||
} | ||
|
||
filter { | ||
mutate { | ||
remove_field => ["@version", "@timestamp"] | ||
} | ||
json { | ||
source => "abstract_inverted_index" | ||
target => "abstract_inverted_index_parsed" | ||
} | ||
ruby { | ||
code => ' | ||
abstract_inverted_index = event.get("abstract_inverted_index_parsed") | ||
if abstract_inverted_index | ||
abstract_length = abstract_inverted_index.values.flatten.max + 1 | ||
abstract_words = Array.new(abstract_length, "") | ||
abstract_inverted_index.each do |word, positions| | ||
positions.each do |position| | ||
abstract_words[position] = word | ||
end | ||
end | ||
abstract = abstract_words.join(" ") | ||
event.set("abstract", abstract) | ||
end | ||
' | ||
} | ||
mutate { | ||
remove_field => ["abstract_inverted_index", "abstract_inverted_index_parsed"] | ||
convert => { | ||
"cited_by_count" => "integer" | ||
"publication_year" => "integer" | ||
} | ||
} | ||
jdbc_streaming { | ||
jdbc_driver_library => "/opt/logstash/drivers/postgresql-42.7.3.jar" | ||
jdbc_driver_class => "org.postgresql.Driver" | ||
jdbc_connection_string => "jdbc:postgresql://${ES_DB_HOST}:${ES_DB_PORT}/${ES_DB_NAME}" | ||
jdbc_user => "${ES_DB_USER}" | ||
jdbc_password => "${ES_DB_PASSWORD}" | ||
statement => " | ||
SELECT | ||
a.id AS author_id, | ||
wa.author_position, | ||
a.display_name AS author_name, | ||
a.works_count AS author_works_count, | ||
a.cited_by_count AS author_cited_by_count, | ||
wa.institution_id, | ||
a.orcid | ||
FROM openalex.works_authorships wa | ||
JOIN openalex.authors a ON wa.author_id = a.id | ||
WHERE wa.work_id = :work_id | ||
ORDER BY wa.author_position ASC | ||
" | ||
parameters => { "work_id" => "work_id" } | ||
target => "authors" | ||
} | ||
} | ||
|
||
output { | ||
stdout { codec => json } | ||
elasticsearch { | ||
hosts => ["${ES_NODE}"] | ||
index => "denormalized_works_test2" | ||
document_id => "%{[work_id]}" | ||
doc_as_upsert => true | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.