Skip to content

Commit

Permalink
DDEX crawler (#7470)
Browse files Browse the repository at this point in the history
  • Loading branch information
michellebrier authored Feb 6, 2024
1 parent 9e23123 commit 0934d73
Show file tree
Hide file tree
Showing 25 changed files with 375 additions and 250 deletions.
10 changes: 10 additions & 0 deletions dev-tools/compose/docker-compose.ddex.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ services:
environment:
- DDEX_PORT=9000
- DDEX_MONGODB_URL=mongodb://mongo:mongo@ddex-mongo:27017/ddex?authSource=admin&replicaSet=rs0
env_file:
- ${PROJECT_ROOT}/packages/ddex/.env
ports:
- "9000:9000"
networks:
Expand All @@ -27,6 +29,8 @@ services:
dockerfile: ${PROJECT_ROOT}/packages/ddex/ingester/Dockerfile
environment:
- DDEX_MONGODB_URL=mongodb://mongo:mongo@ddex-mongo:27017/ddex?authSource=admin&replicaSet=rs0
env_file:
- ${PROJECT_ROOT}/packages/ddex/.env
depends_on:
ddex-mongo:
condition: service_healthy
Expand All @@ -43,6 +47,8 @@ services:
dockerfile: ${PROJECT_ROOT}/packages/ddex/ingester/Dockerfile
environment:
- DDEX_MONGODB_URL=mongodb://mongo:mongo@ddex-mongo:27017/ddex?authSource=admin&replicaSet=rs0
env_file:
- ${PROJECT_ROOT}/packages/ddex/.env
depends_on:
ddex-mongo:
condition: service_healthy
Expand All @@ -59,6 +65,8 @@ services:
dockerfile: ${PROJECT_ROOT}/packages/ddex/ingester/Dockerfile
environment:
- DDEX_MONGODB_URL=mongodb://mongo:mongo@ddex-mongo:27017/ddex?authSource=admin&replicaSet=rs0
env_file:
- ${PROJECT_ROOT}/packages/ddex/.env
depends_on:
ddex-mongo:
condition: service_healthy
Expand All @@ -79,6 +87,8 @@ services:
TURBO_TOKEN: '${TURBO_TOKEN}'
environment:
- DDEX_MONGODB_URL=mongodb://mongo:mongo@ddex-mongo:27017/ddex?authSource=admin&replicaSet=rs0
env_file:
- ${PROJECT_ROOT}/packages/ddex/.env
depends_on:
ddex-mongo:
condition: service_healthy
Expand Down
13 changes: 13 additions & 0 deletions packages/ddex/.env.dev
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
NODE_ENV='dev'

AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_REGION=
AWS_BUCKET_RAW=
AWS_BUCKET_INDEXED=

DDEX_KEY=
DDEX_SECRET=

# use stage optimizely
OPTIMIZELY_SDK_KEY='MX4fYBgANQetvmBXGpuxzF'
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
NODE_ENV='stage'

AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_REGION='us-west-2'
AWS_BUCKET_RAW='ddex-dev-audius-raw'
AWS_BUCKET_INDEXED='ddex-dev-audius-indexed'

DDEX_KEY='49d5e13d355709b615b7cce7369174fb240b6b39'
DDEX_SECRET='2b2c2b90d9a489234ae629a5284de84fb0633306257f17667aaebf2345d92152'
OPTIMIZELY_SDK_KEY='MX4fYBgANQetvmBXGpuxzF'
43 changes: 43 additions & 0 deletions packages/ddex/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@ Processes and uploads DDEX releases to Audius.
## Local Dev
DDEX requires these services: `ddex-webapp`, `ddex-crawler`, `ddex-indexer`, `ddex-parser`, `ddex-publisher`, `ddex-mongo`.

### Env configuration
All services read from `.env`.

To use stage envs: `cp .env.stage .env`

To use dev envs: `cp .env.dev .env`

Fill in all missing values. See the `Creating a bucket in S3` section below for how to set up S3.

### Setup
1. (At the monorepo root) Generate a keyfile for mongodb:
```
Expand All @@ -24,3 +33,37 @@ To access the ddex db via the mongo shell: `docker exec -it ddex-mongo mongosh -

### Develop with hot reloading
Each service can be run independently as long as `ddex-mongo` is up. See the respective subdirectories' READMEs.

### Creating a bucket in S3
1. Create a new bucket in the S3 console with the name `ddex-[dev|staging]-<label/distributor>-raw`. Use all the defaults, including "ACLs disabled"
2. Do the same for a bucket named `ddex-[dev|staging]-<label/distributor>-indexed`. Use all the defaults, including "ACLs disabled"
3. Create an IAM Policy [here](https://us-east-1.console.aws.amazon.com/iamv2/home?region=us-west-2#/policies/create) (or search IAM and click Policies > Create Policy). Select S3.
* Under `Read` choose `GetObject` and `GetObjectAttributes`.
* Under `Write` choose `DeleteObject` and `PutObject`.
* Under `List` choose `ListBucket`.
* Click `Add Arn` for object actions, enter the bucket name ending with `raw`, and check the box for `Any object name`.
* Click `Add Arn` for object actions again, enter the bucket name ending with `indexed`, and check the box for `Any object name`.
* Click `Add Arn` for bucket actions and enter the bucket name ending with `raw`.
* Click `Add Arn` for bucket actions again and enter the bucket name ending with `indexed`.
* Click Next, and then name the policy `ddex-[dev|staging]-<label/distributor>-policy`.
4. Create an IAM User [here](https://us-east-1.console.aws.amazon.com/iamv2/home?region=us-west-2#/users/create) (or search IAM and click Users > Create User).
* Name the user `ddex-[dev|staging]-<label/distributor>-user` and press Next.
* Select "Attach policies directly," and search for the policy you created (`ddex-[dev|staging]-<label/distributor>-policy`). Check the box next to it and press Next and then Create User.
5. Search for your new user and press "Create access key" and then "Third-party service." Copy the access key and secret access key into your .env file (assuming you've already done `cp .env.dev .env`).
6. Go back to the bucket ending with `raw`, and add CORS at the bottom of the Permissions tab. Here's an example for dev, but for a prod environment you'll want to replace "*" in "AllowedOrigins" with the domain name that the frontend will be served from:
```json
[
{
"AllowedHeaders": [
"*"
],
"AllowedMethods": [
"PUT"
],
"AllowedOrigins": [
"*"
],
"ExposeHeaders": []
}
]
```
6 changes: 3 additions & 3 deletions packages/ddex/ingester/.air.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@ tmp_dir = "tmp"
include_dir = []
include_ext = ["go", "tpl", "tmpl", "html"]
include_file = []
kill_delay = "0s"
kill_delay = "5s"
log = "build-errors.log"
poll = false
poll_interval = 0
post_cmd = []
pre_cmd = []
rerun = false
rerun_delay = 500
send_interrupt = false
send_interrupt = true
stop_on_error = false

[color]
Expand All @@ -39,7 +39,7 @@ tmp_dir = "tmp"
time = false

[misc]
clean_on_exit = false
clean_on_exit = true

[screen]
clear_on_rebuild = false
Expand Down
1 change: 1 addition & 0 deletions packages/ddex/ingester/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ Indexes and parses new DDEX uploads.

To run an ingester service locally with hot reloading:
1. Make sure you can connect to mongo at `mongodb://mongo:mongo@localhost:27017/ddex`. See `packages/ddex/README.md` on how to spin up `ddex-mongo` and the other ddex containers.
2. Make sure you've configured your `packages/ddex/.env` and S3 buckets according to the toplevel DDEX README.
3. `air -c .air.toml -- --service [crawler|indexer|parser]`
38 changes: 30 additions & 8 deletions packages/ddex/ingester/cmd/main.go
Original file line number Diff line number Diff line change
@@ -1,33 +1,55 @@
package main

import (
"context"
"flag"
"fmt"
"ingester/crawler"
"ingester/indexer"
"ingester/parser"
"log"
"os"
"os/signal"
"syscall"

"github.com/joho/godotenv"
)

// main selects one ingester service via the --service flag and blocks until
// SIGINT or SIGTERM cancels the shared context.
//
// NOTE(review): this listing looks like a rendered diff in which removed and
// added lines are interleaved (e.g. both `crawler.Run()` and
// `go crawler.Run(ctx)` appear under the same case, and the default branch
// carries its own signal wait) — confirm against the actual file before
// relying on the exact statement sequence.
func main() {
service := flag.String("service", "", "Specify the service to run: crawler, indexer, or parser")
flag.Parse()

// Root context handed to the selected service; cancel is called by the
// signal-handling goroutine below and again on return via defer.
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

// Turn the first SIGINT/SIGTERM into a context cancellation.
go func() {
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
sig := <-sigChan
log.Printf("Received signal: %v, shutting down...\n", sig)
cancel()
}()

// Load variables from ../.env (a relative path). A load error is only
// logged; execution continues with whatever environment is already set.
err := godotenv.Load("../.env")
if err != nil {
if os.IsNotExist(err) {
log.Println("No .env file found, proceeding with existing environment variables")
} else {
log.Println("Error loading .env file:", err)
}
}

// Dispatch on the --service flag value.
switch *service {
case "crawler":
crawler.Run()
go crawler.Run(ctx)
case "indexer":
indexer.Run()
go indexer.Run(ctx)
case "parser":
parser.Run()
go parser.Run(ctx)
default:
fmt.Println("Unknown service: " + *service)
// sleep
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
sig := <-sigChan
fmt.Printf("Received signal: %v, shutting down...\n", sig)
}

<-ctx.Done() // Wait until the context is canceled
log.Println("Service stopped")
}
54 changes: 54 additions & 0 deletions packages/ddex/ingester/common/common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package common

import (
"context"
"log"
"os"
"time"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/credentials"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/s3"
"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
)

// InitMongoClient builds a MongoDB client from the DDEX_MONGODB_URL environment
// variable, falling back to a localhost replica-set URL when that variable is
// unset or empty. It panics if mongo.Connect returns an error.
func InitMongoClient(ctx context.Context) *mongo.Client {
	const fallbackURI = "mongodb://mongo:mongo@localhost:27017/ddex?authSource=admin&replicaSet=rs0"

	uri := os.Getenv("DDEX_MONGODB_URL")
	if uri == "" {
		uri = fallbackURI
	}

	clientOpts := options.Client().ApplyURI(uri)
	mongoClient, connectErr := mongo.Connect(ctx, clientOpts)
	if connectErr != nil {
		panic(connectErr)
	}

	log.Println("Connected to mongo")
	return mongoClient
}

// InitS3Client creates an S3 client using the region and static credentials
// read (via MustGetenv) from AWS_REGION, AWS_ACCESS_KEY_ID and
// AWS_SECRET_ACCESS_KEY. It panics if the AWS session cannot be created.
func InitS3Client() *s3.S3 {
	region := MustGetenv("AWS_REGION")
	accessKey := MustGetenv("AWS_ACCESS_KEY_ID")
	secretKey := MustGetenv("AWS_SECRET_ACCESS_KEY")

	cfg := &aws.Config{
		Region:      aws.String(region),
		Credentials: credentials.NewStaticCredentials(accessKey, secretKey, ""),
	}

	awsSession, sessErr := session.NewSession(cfg)
	if sessErr != nil {
		panic(sessErr)
	}

	log.Println("Connected to s3")
	return s3.New(awsSession)
}

// MustGetenv returns the value of the environment variable named by key.
// If the variable is unset or empty, it logs the problem, sleeps for one hour
// and then terminates the process via log.Fatal.
func MustGetenv(key string) string {
	if found := os.Getenv(key); found != "" {
		return found
	}

	// Delay the exit so a misconfigured container does not restart constantly.
	log.Println("Missing required env variable: ", key, " sleeping ...")
	time.Sleep(time.Hour)
	log.Fatal("Missing required env variable: ", key)
	return "" // not reached: log.Fatal exits the process
}
Loading

0 comments on commit 0934d73

Please sign in to comment.