From bfe633c065b7c93c46087dc513605c345e45c03b Mon Sep 17 00:00:00 2001
From: Matt Joiner
Date: Tue, 23 May 2023 17:39:00 +1000
Subject: [PATCH] Move scripts to another repo

Moved to https://github.com/getlantern/replica-scripts. This is because the
large number of working files for bucket-sql causes the go tool to have
issues. https://github.com/golang/go/issues/59860
---
 scripts/bucket-sql/.gitignore         |  4 --
 scripts/bucket-sql/Makefile           | 56 --------------------------
 scripts/bucket-sql/README             | 19 ---------
 scripts/bucket-sql/iran-item1.sql     | 13 ------
 scripts/bucket-sql/iran-item2.sql     | 15 -------
 scripts/bucket-sql/ned-b3.sql         | 20 ----------
 scripts/bucket-sql/schema             | 35 ----------------
 scripts/cleanup.py                    | 57 ---------------------------
 scripts/delete-sqs-queues             | 23 -----------
 scripts/nuke-infohashes/README        |  1 -
 scripts/nuke-infohashes/buckets       |  3 --
 scripts/nuke-infohashes/infohashes    |  6 ---
 scripts/nuke-infohashes/nuke-infohash | 19 ---------
 13 files changed, 271 deletions(-)
 delete mode 100644 scripts/bucket-sql/.gitignore
 delete mode 100644 scripts/bucket-sql/Makefile
 delete mode 100644 scripts/bucket-sql/README
 delete mode 100644 scripts/bucket-sql/iran-item1.sql
 delete mode 100644 scripts/bucket-sql/iran-item2.sql
 delete mode 100644 scripts/bucket-sql/ned-b3.sql
 delete mode 100644 scripts/bucket-sql/schema
 delete mode 100755 scripts/cleanup.py
 delete mode 100755 scripts/delete-sqs-queues
 delete mode 100644 scripts/nuke-infohashes/README
 delete mode 100644 scripts/nuke-infohashes/buckets
 delete mode 100644 scripts/nuke-infohashes/infohashes
 delete mode 100755 scripts/nuke-infohashes/nuke-infohash

diff --git a/scripts/bucket-sql/.gitignore b/scripts/bucket-sql/.gitignore
deleted file mode 100644
index 890b30e..0000000
--- a/scripts/bucket-sql/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-*.json*
-replica.db*
-replica/
-getlantern-replica-frankfurt/
diff --git a/scripts/bucket-sql/Makefile b/scripts/bucket-sql/Makefile
deleted file mode 100644
index 0539972..0000000
--- a/scripts/bucket-sql/Makefile
+++ /dev/null
@@ -1,56 +0,0 @@
-# You should invoke this passing BUCKET. See the README.
-
-# stop attempts to make .json.o (wtf?)
-.SUFFIXES:
-
-BUCKET_JSON_FILE = $(BUCKET).json
-
-define INSERT_SQL
-insert into object \
-	select value->>'Key', value->>'LastModified', value->>'ETag', value->>'Size', '$(BUCKET)' \
-	from json_each(readfile('$$file'));
-endef
-
-SQLITE_CMD := sqlite3 -bail -cmd '.changes on' replica.db
-
-init-schema:
-	sqlite3 replica.db < schema
-
-# This deletes any existing files for BUCKET, then adds them using the chunked JSON files. Run this for each bucket you want included in the sqlite db. Depending on the .0 file causes us to recursively generate the rest of the files first. There are multiple files because sqlite barfs on JSON over 1GB.
-import-bucket: $(BUCKET_JSON_FILE).0
-	$(SQLITE_CMD) "delete from object where bucket='$(BUCKET)'"
-	for file in $(BUCKET_JSON_FILE).*; do echo $$file; $(SQLITE_CMD) <<< "$(INSERT_SQL)"; done
-
-# Unadulterated output from s3api list-objects-v2, with a .Contents array.
-%.json:
-	aws-endpoint-env --profile '$(PROFILE)' s3api list-objects-v2 --bucket $* > $@~
-	mv $@~ $@
-
-# We need BUCKET_JSON_FILE, but we don't actually want to trigger on it.
-.PHONY: rm-split-files
-rm-split-files: $(BUCKET_JSON_FILE)
-	-rm -v $(BUCKET_JSON_FILE).*
-
-# If we're calling this, start over by wiping them first.
-split-json: rm-split-files $(BUCKET_JSON_FILE).0
-
-# This is recursive. It probably should pass make flags.
-$(BUCKET_JSON_FILE).%: $(BUCKET_JSON_FILE)
-	jq ".Contents[$*000000:$$((($*+1)*1000000))]" $< > $@
-	[[ $$(jq length $@) == 0 ]] || make $(BUCKET_JSON_FILE).$$(($*+1))
-
-.PHONY: get-metadata
-get-metadata:
-	# aws-endpoint-env is a wrapper around aws that lets me provide an endpoint url from the environment. There's probably a much better way to do this, like having AWS as a make variable.
-	aws-endpoint-env --profile $(PROFILE) s3 sync s3://$(BUCKET) $(BUCKET) --exclude '*' --include '*/metadata'
-
-$(BUCKET):
-	make get-metadata
-
-import-metadata: $(BUCKET)
-	for info_hash_hex in `ls $(BUCKET)`; do \
-		`# echo $$info_hash_hex || break;` \
-		sqlite3 replica.db <<< \
-			"replace into metadata (info_hash_hex, metadata) values ('$$info_hash_hex', readfile('$(BUCKET)/$$info_hash_hex/metadata'))" \
-		|| break; \
-	done
diff --git a/scripts/bucket-sql/README b/scripts/bucket-sql/README
deleted file mode 100644
index bee4734..0000000
--- a/scripts/bucket-sql/README
+++ /dev/null
@@ -1,19 +0,0 @@
-This directory contains code to generate a SQLite DB from Replica S3 bucket listings. This can be used for high-level queries like in https://github.com/getlantern/lantern-internal/issues/5675.
-
-To get started, run
-
-    make init-schema
-    AWS_ENDPOINT_URL=https://1c693b3f1031ed33f68653b1e67dfbef.r2.cloudflarestorage.com make import-bucket BUCKET=replica PROFILE=replica-r2
-
-where BUCKET suits your system, and PROFILE is a profile to use from your AWS config. On my system I had a wrapper around aws to allow custom endpoints from the environment; you might need something similar. Run this for all the buckets you want to import. You can clear the JSON files and rerun to refresh buckets.
-
-Examples:
-
--- size and count of all files relating to user uploads
-
-sqlite> select sum(size)/1e9 as size_gb, count(*) as num_objects from info_hash where info_hash in (select info_hash from info_hash join user_upload using (key));
-+---------------+-------------+
-| size_gb       | num_objects |
-+---------------+-------------+
-| 335.021089321 | 204825      |
-+---------------+-------------+
diff --git a/scripts/bucket-sql/iran-item1.sql b/scripts/bucket-sql/iran-item1.sql
deleted file mode 100644
index db3aae0..0000000
--- a/scripts/bucket-sql/iran-item1.sql
+++ /dev/null
@@ -1,13 +0,0 @@
- -- TODO: Link to item in grant that this solves.
-with external_uploads as (
-    select
-        strftime('%Y-%m', datetime(metadata->'retrieval_timestamp', 'unixepoch')) as month,
-        metadata->>'external_source' as source
-    from metadata
-)
-select
-    count(*),
-    coalesce(source->>'description', source->>'alternative_channel_name', source->>'url') as human
-from external_uploads
-where source is not null and month between '2023-01' and '2023-03'
-group by human;
\ No newline at end of file
diff --git a/scripts/bucket-sql/iran-item2.sql b/scripts/bucket-sql/iran-item2.sql
deleted file mode 100644
index 2eb0acf..0000000
--- a/scripts/bucket-sql/iran-item2.sql
+++ /dev/null
@@ -1,15 +0,0 @@
- -- TODO: Link to item in grant that this solves.
-with uploads as (
-    select
-        strftime('%Y-%m', datetime(metadata->'creation_timestamp', 'unixepoch')) as month,
-        metadata->>'request_country' as country,
-        metadata->>'uploader' as uploader,
-        *
-    from metadata
-)
-select
-    count(*)
-from uploads
-where country='IR' and uploader='AnonymousEndpoint'
-and month between '2023-01' and '2023-03'
-;
diff --git a/scripts/bucket-sql/ned-b3.sql b/scripts/bucket-sql/ned-b3.sql
deleted file mode 100644
index 0240775..0000000
--- a/scripts/bucket-sql/ned-b3.sql
+++ /dev/null
@@ -1,20 +0,0 @@
- -- Query for https://github.com/getlantern/grants/issues/545.
-
- -- 1-based months actually trivially gives us the NED offset quarters.
- with
-    deduped as (
-        select min(last_modified) as first_appearance, size from info_name group by key
-    ),
-    year_mo as (
-        select strftime('%Y', first_appearance) as year, strftime('%m', first_appearance) as month, sum(size)/1e9 as size_gb, count(*) num_files from deduped group by year, month
-    ),
-    by_quarter as (
-        select sum(size_gb) as size_gb, sum(num_files) as num_files, (year*12+month)/3 as quarter from year_mo group by quarter
-    )
-select
-    -- This is some bullshit to work around 1-based months. I think it works, but the alternative is to use 0-based months earlier.
-    (quarter*3-1)/12 as year,
-    (quarter*3-1)%12+1 as month,
-    sum(size_gb) over win as cum_size_gb,
-    sum(num_files) over win as cum_num_files
-from by_quarter window win as (order by quarter);
diff --git a/scripts/bucket-sql/schema b/scripts/bucket-sql/schema
deleted file mode 100644
index c69f260..0000000
--- a/scripts/bucket-sql/schema
+++ /dev/null
@@ -1,35 +0,0 @@
-CREATE TABLE object(key, last_modified, etag, size, bucket);
-create table metadata (info_hash_hex primary key, metadata);
-
-CREATE VIEW data_suf as
-    with
-    first_slash as (
-        select instr(key, '/') as first_slash_index, *
-        from object ),
-    second_slash as (
-        select instr(substr(key, first_slash_index+1), '/') as second_slash_index, *
-        from first_slash where first_slash_index<>0 )
-    select
-        substr(key, first_slash_index+second_slash_index+1) as data_suf,
-        *
-    from second_slash
-    where substr(key, first_slash_index+1, second_slash_index-1)='data'
-/* data_suf(data_suf,second_slash_index,first_slash_index,"key",last_modified,etag,size,bucket) */;
-CREATE VIEW info_name as
-    with
-    first_slash as (
-        select instr(data_suf, '/') as first_slash, *
-        from data_suf )
-    select
-        iif(first_slash=0, data_suf, substr(data_suf, 1, first_slash-1)) as info_name,
-        *
-    from first_slash
-/* info_name(info_name,first_slash,data_suf,second_slash_index,first_slash_index,"key",last_modified,etag,size,bucket) */;
-CREATE VIEW user_upload as
-    select * from info_name where instr(info_name, '_')=0 and instr(info_name, ' ')=0
-/* user_upload(info_name,first_slash,data_suf,second_slash_index,first_slash_index,"key",last_modified,etag,size,bucket) */;
-CREATE VIEW info_hash as
-    select substr(key, 1, 40) as info_hash, *
-    from object
-    where instr(key, '/')=41 or length(key)=40
-/* info_hash(info_hash,"key",last_modified,etag,size,bucket) */;
diff --git a/scripts/cleanup.py b/scripts/cleanup.py
deleted file mode 100755
index 770cc13..0000000
--- a/scripts/cleanup.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/env python3
-
-# This contains stuff relating to cleaning up Replica S3 stuff. Currently
-# that's just removing subscriptions that have no endpoints (Amazon says
-# somewhere they're supposed to clean these up for you?). You should clean up
-# the SQS queues first.
-
-import boto3
-import logging
-
-sqs = boto3.client("sqs")
-sns = boto3.client("sns")
-
-
-def all_subscriptions():
-    client = sns
-    kwargs = dict(
-        TopicArn="arn:aws:sns:ap-southeast-1:670960738222:replica-search-events"
-    )
-    while True:
-        response = client.list_subscriptions_by_topic(**kwargs)
-        for sub in response["Subscriptions"]:
-            yield sub
-        if "NextToken" not in response:
-            break
-        kwargs["NextToken"] = response["NextToken"]
-
-
-def arn_exists(arn):
-    _, _, service, _, _, name = arn.split(":")
-    if service != "sqs":
-        raise ValueError("unexpected service", service)
-    try:
-        sqs.get_queue_url(QueueName=name)
-    except sqs.exceptions.QueueDoesNotExist:
-        return False
-    return True
-
-
-def remove_orphaned_subs():
-    for sub in all_subscriptions():
-        if arn_exists(sub["Endpoint"]):
-            logging.info("leaving subscription for %r", sub["Endpoint"])
-            continue
-        logging.info("deleting subscription for %r", sub["Endpoint"])
-        sns.unsubscribe(SubscriptionArn=sub["SubscriptionArn"])
-
-
-def main():
-    logging.basicConfig(level=logging.DEBUG)
-    logging.getLogger("botocore").setLevel(logging.INFO)
-    logging.getLogger("urllib3").setLevel(logging.INFO)
-    remove_orphaned_subs()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/delete-sqs-queues b/scripts/delete-sqs-queues
deleted file mode 100755
index 83c4546..0000000
--- a/scripts/delete-sqs-queues
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-# This can be used to delete queues that meet certain criteria, around pending
-# messages (implying that nothing has consumed them for a while), and queue
-# modification age. It's useful as a reference for structuring aws cli
-# commands too. I probably should have just written this in Python to begin
-# with, but that's not as easy to work with interactively to discover how to
-# do this stuff.
-
-set -eux
-: "${AWS_PARALLELISM:=10}"
-: "${XARGS_PARAMS:=-L 1 -P $AWS_PARALLELISM}"
-MAX_AGE_TIMESTAMP="$(date --date "$MAX_AGE" +%s)"
-
-aws sqs list-queues --queue-name-prefix "$QUEUE_NAME_PREFIX" \
-| jq '.QueueUrls | .[]' \
-| xargs ${XARGS_PARAMS[@]} aws sqs get-queue-attributes --attribute-names All --queue-url \
-| jq ".Attributes | select(.ApproximateNumberOfMessages | tonumber >= ${MIN_PENDING_MESSAGES:-2000}) | select(.LastModifiedTimestamp | tonumber < $MAX_AGE_TIMESTAMP)" \
-| jq '.QueueArn | split(":")[-1]' \
-| xargs ${XARGS_PARAMS[@]} aws sqs get-queue-url --queue-name \
-| jq '.QueueUrl' \
-| tee /dev/tty \
-| xargs ${XARGS_PARAMS[@]} aws sqs delete-queue --queue-url
diff --git a/scripts/nuke-infohashes/README b/scripts/nuke-infohashes/README
deleted file mode 100644
index 5b7088e..0000000
--- a/scripts/nuke-infohashes/README
+++ /dev/null
@@ -1 +0,0 @@
-See the script nuke-infohash. Other files are included as samples.
diff --git a/scripts/nuke-infohashes/buckets b/scripts/nuke-infohashes/buckets
deleted file mode 100644
index 2c43de1..0000000
--- a/scripts/nuke-infohashes/buckets
+++ /dev/null
@@ -1,3 +0,0 @@
-getlantern-replica replica-searcher
-getlantern-replica-frankfurt frankfurt-anacrolix
-getlantern-replica-staging replica-staging
diff --git a/scripts/nuke-infohashes/infohashes b/scripts/nuke-infohashes/infohashes
deleted file mode 100644
index 3bddc01..0000000
--- a/scripts/nuke-infohashes/infohashes
+++ /dev/null
@@ -1,6 +0,0 @@
-e4e0d13fd2656db6a7e326adcbc29cd52e8535db
-4cde18918e4e384a478f0ed2cf33d59c183c0493
-03e901ddff275c7a571aa351b740a4cca22b977b
-ea66519779cf1793c0acd09404ddbb56546965f1
-84abb3b8a81bf2cfc26d1e261046a990383a9965
-91373e4cb490ac65050aa3c2f046d320744a5553
diff --git a/scripts/nuke-infohashes/nuke-infohash b/scripts/nuke-infohashes/nuke-infohash
deleted file mode 100755
index c637468..0000000
--- a/scripts/nuke-infohashes/nuke-infohash
+++ /dev/null
@@ -1,19 +0,0 @@
-set -ue
-
-# Takes a file of bucket/profile pairs, then a file of infohashes; any other parameters are passed directly to aws s3 rm (you might want to pass --dryrun to check what will happen).
-
-s3=$1
-shift
-infohashes=$1
-shift
-
-while read infohash; do
-    # Skip empty lines
-    [ -n "$infohash" ] || continue
-    # Don't allow something that isn't an infohash, to avoid deleting an entire bucket (the trailing slash should prevent that though).
-    (( ${#infohash} == 40 )) || { printf >&2 "bad infohash '%q'\n" "$infohash"; exit 1; }
-    while read bucket profile; do
-        #echo $profile $bucket $infohash
-        aws s3 rm "$@" --recursive --profile "$profile" "s3://$bucket/$infohash/"
-    done < "$s3"
-done < "$infohashes"
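
The Makefile and README above both invoke an aws-endpoint-env wrapper that was never part of this repository; the README only notes that "you might need something similar". A minimal sketch of such a wrapper is below. The name and the AWS_ENDPOINT_URL handling are assumptions rather than the original script, and recent AWS CLI releases can read AWS_ENDPOINT_URL themselves, which may make a wrapper unnecessary.

    #!/bin/bash
    # Hypothetical sketch of an aws-endpoint-env style wrapper: forward an
    # endpoint URL taken from the environment, otherwise behave like plain aws.
    set -eu
    if [ -n "${AWS_ENDPOINT_URL:-}" ]; then
        exec aws --endpoint-url "$AWS_ENDPOINT_URL" "$@"
    else
        exec aws "$@"
    fi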
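
The Makefile splits each bucket listing into chunks of one million objects because SQLite rejects JSON documents approaching 1 GB. Roughly the same flow can be followed without make; the sketch below assumes jq and sqlite3 are installed, that the schema has already been applied, and that listing.json holds raw s3api list-objects-v2 output (the script and file names are illustrative only).

    #!/bin/bash
    # Rough standalone equivalent of the Makefile's split-json + import-bucket steps.
    # Usage: ./import-listing <bucket> <listing.json>
    set -eu
    bucket=$1
    listing=$2
    chunk=0
    sqlite3 -bail replica.db "delete from object where bucket='$bucket'"
    while :; do
        # Slice one million .Contents entries at a time so no single JSON
        # document handed to SQLite approaches its ~1GB limit.
        jq ".Contents[$((chunk * 1000000)):$(((chunk + 1) * 1000000))]" "$listing" > chunk.json
        [ "$(jq length chunk.json)" -gt 0 ] || break
        sqlite3 -bail replica.db "insert into object
            select value->>'Key', value->>'LastModified', value->>'ETag', value->>'Size', '$bucket'
            from json_each(readfile('chunk.json'));"
        chunk=$((chunk + 1))
    done
    rm -f chunk.json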