# For more detailed documentation, see README.md
#
# To start, run:
# % docker compose up
#
# What you should see:
# 1. a wait while container images download (speed depends on your network), or nothing once they are cached
# 2a. crawler should run and download 1 sample pdf
# 2b. demo ui starting (in parallel with 2a)
# 3. lots of logging as opensearch starts. Eventually 'Waiting for opensearch to exit.'
# 4. the sycamore container downloads pytorch files, then runs ray to import
# 5. the UI is ready at localhost:3000 (the port is reachable earlier, but the UI won't work until now)
#
# Troubleshooting/Extensions:
# 1. Something is messed up, I just want to reset:
# docker compose run reset
# 2. I want more sort benchmark files:
# docker compose run sycamore_crawler_http_sort_all
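#
# Image tags and host ports below come from ${...} variables, which docker
# compose substitutes from the environment or from a .env file next to this
# compose.yaml. A minimal sketch (the values are illustrative, not pinned to
# any released version):
# % cat .env
# UI_VERSION=stable
# UI_PORT=3000
# OPENSEARCH_VERSION=stable
# OPENSEARCH_PORT=9200
# % OPENAI_API_KEY=sk-... docker compose up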
version: '3'
services:
demo-ui:
image: arynai/sycamore-demo-ui:${UI_VERSION}
ports:
- 127.0.0.1:${UI_PORT}:3000 # UI port
environment:
- OPENAI_API_KEY
- OPENSEARCH_HOST=opensearch
- SSL # Set to 1 if you want self-signed certificates
volumes:
- crawl_data:/app/.scrapy
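    # Example: serve the UI over self-signed TLS (per the SSL comment above),
    # assuming the remaining variables are already set in .env:
    # % SSL=1 docker compose up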
opensearch:
image: arynai/sycamore-opensearch:${OPENSEARCH_VERSION}
ports:
- 127.0.0.1:${OPENSEARCH_PORT}:9200 # opensearch port, directly accessed by UI
volumes:
- opensearch_data:/usr/share/opensearch/data
environment:
- OPENAI_API_KEY
      # To help debug issues with opensearch startup.
- DEBUG
- NOEXIT
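    # Example: surface more startup detail and keep the container from exiting
    # (assuming the entrypoint honors these variables, per the comment above):
    # % DEBUG=1 NOEXIT=1 docker compose up opensearch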
rps:
image: arynai/sycamore-remote-processor-service:${RPS_VERSION}
ports:
- 127.0.0.1:${RPS_PORT}:2796
environment:
- SSL_HOSTNAME=rps
      # We will probably want to add a config bind mount once rps is merged into sycamore.
jupyter:
image: arynai/sycamore-jupyter:${JUPYTER_VERSION}
ports:
- 127.0.0.1:${JUPYTER_PORT}:8888
volumes:
- crawl_data:/app/work/crawl_data
- jupyter_data:/app/work/docker_volume
- type: bind
source: ${JUPYTER_BIND_DIR}
target: /app/work/bind_dir
      # Example of mapping a local sycamore checkout into the jupyter container
      # so that a more recent version of sycamore is available:
# - type: bind
# source: /home/eric/projects/sycamore-main
# target: /app/sycamore.git
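      # Note: JUPYTER_BIND_DIR should name an existing directory on the host;
      # set it in .env (e.g. JUPYTER_BIND_DIR=$HOME/sycamore-jupyter, an
      # illustrative path).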
environment:
- OPENAI_API_KEY
- AWS_ACCESS_KEY_ID
- AWS_SECRET_ACCESS_KEY
- AWS_SESSION_TOKEN
- AWS_CREDENTIAL_EXPIRATION
- JUPYTER_S3_BUCKET
- JUPYTER_S3_PREFIX
- JUPYTER_CONFIG_RESET # set this to 'yes' to reset the configuration
- SSL # Set to 1 if you want self-signed certificates
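    # Example: wipe and regenerate the Jupyter configuration on the next start
    # (per the JUPYTER_CONFIG_RESET comment above):
    # % JUPYTER_CONFIG_RESET=yes docker compose up jupyter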
### Optional profiles below here
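  # reset clears the opensearch and crawl volumes and restores crawl_data
  # ownership to uid/gid 1000 (the container user); invoke it with
  # 'docker compose run reset' as noted in the header.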
reset:
profiles: ['reset']
image: ubuntu
volumes:
- opensearch_data:/tmp/opensearch_data
- crawl_data:/tmp/crawl_data
command: /bin/sh -c 'echo size-before && du -s /tmp/* && rm -rf /tmp/opensearch_data/* /tmp/crawl_data/* && chown 1000:1000 /tmp/crawl_data && echo size-after && du -s /tmp/*'
fixuser:
profiles: ['fixuser']
image: ubuntu
volumes:
- crawl_data:/tmp/crawl_data
command: /bin/sh -c 'echo "starting chown..." && chown -R 1000:1000 /tmp/crawl_data && echo "chown successful"'
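  # Example: repair crawl-volume ownership without wiping any data ('run'
  # enables the service's profile automatically):
  # % docker compose run fixuser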
# Note: If you run
# aws s3 cp s3://aryn-public/query-ui-cache/ntsb-2024-09-11/cache.db apps/query-ui/cache_dir
# before starting the container, it will make the queries
# "How many accidents in WA" and "How many accidents in washington" significantly faster since
# it will skip calling the LLM on the input data.
query-ui:
# TODO: look into https://www.joyfulbikeshedding.com/blog/2021-03-15-docker-and-the-host-filesystem-owner-matching-problem.html
# as another solution to the uid/gid problem.
profiles: ['query-ui']
image: arynai/sycamore-query-ui:${QUERY_UI_VERSION}
ports:
- 127.0.0.1:${QUERY_UI_PORT}:8501
volumes:
- type: bind
source: apps/query-ui/cache_dir
target: /app/work/cache_dir
environment:
- OPENAI_API_KEY
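    # Example launch (the --profile flag enables this optional service):
    # % docker compose --profile query-ui up query-ui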
sycamore_crawler_http_sort_one:
profiles: ['sort-one']
image: arynai/sycamore-crawler-http:${SYCAMORE_CRAWLER_HTTP_VERSION}
volumes:
- crawl_data:/app/.scrapy
sycamore_crawler_http_sort_all:
profiles: ['sort-all']
image: arynai/sycamore-crawler-http:${SYCAMORE_CRAWLER_HTTP_VERSION}
volumes:
- crawl_data:/app/.scrapy
command: -a preset=sort
sycamore_crawler_http:
profiles: ['crawl-http']
image: arynai/sycamore-crawler-http:${SYCAMORE_CRAWLER_HTTP_VERSION}
volumes:
- crawl_data:/app/.scrapy
command: help
sycamore_crawler_s3:
profiles: ['crawl-s3']
image: arynai/sycamore-crawler-s3:${SYCAMORE_CRAWLER_S3_VERSION}
volumes:
- crawl_data:/app/.data/.s3
environment:
- AWS_ACCESS_KEY_ID
- AWS_SECRET_ACCESS_KEY
- AWS_SESSION_TOKEN
- AWS_CREDENTIAL_EXPIRATION
command: -h
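    # The default command is -h, which should print the crawler's usage; e.g.:
    # % docker compose run sycamore_crawler_s3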
importer:
profiles: ['deprecated_importer']
image: arynai/sycamore-importer:${SYCAMORE_VERSION}
ports:
      - 127.0.0.1:${RAY_CONSOLE_PORT}:8265 # note: not currently working
volumes:
- crawl_data:/app/.scrapy
environment:
- OPENAI_API_KEY
# a bucket prefix like s3://example or s3://example/prefix/bits
      # files uploaded for textract will be stored under this prefix
      # we recommend setting lifecycle rules on the bucket to delete old documents
- SYCAMORE_TEXTRACT_PREFIX
# to get the next 4 environment variables:
# % aws sso login
# % eval "$(aws configure export-credentials --format env)"
- AWS_ACCESS_KEY_ID
- AWS_SECRET_ACCESS_KEY
- AWS_SESSION_TOKEN
- AWS_CREDENTIAL_EXPIRATION
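      # Example (illustrative bucket; the deprecated profile must be enabled):
      # % SYCAMORE_TEXTRACT_PREFIX=s3://example/textract \
      #     docker compose --profile deprecated_importer up importer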
volumes:
opensearch_data:
crawl_data:
jupyter_data: