jina-ai · alanthssss · Jun 11, 2021 · Jun 11, 2021 · Jun 11, 2021 · Jun 11, 2021
diff --git a/.gitignore b/.gitignore
@@ -141,3 +141,7 @@ _logs/
 _rendered/
 *instances.yaml
 nohup.out
+
+# cache
+enwiki-latest-abstract.xml
+wiki_dump.gz
diff --git a/distributed/data.py b/distributed/data.py
@@ -5,7 +5,8 @@
 from typing import Dict, Callable
 
 from jina.parsers import set_client_cli_parser
-from jina.clients import Client, WebSocketClient
+from jina.clients import Client
+from jina.clients.websocket import WebSocketClient
 from pydantic import validate_arguments
 
 from logger import logger

diff --git a/distributed/helper.py b/distributed/helper.py
@@ -16,7 +16,8 @@
 import yaml
 import chevron
 import numpy as np
-from jina import Document, Request
+from jina import Document
+from jina.types.request import Request
 from jinacld_tools.aws.services.s3 import S3Bucket
 from pydantic import FilePath, validate_arguments
 

diff --git a/distributed/wiki/__init__.py b/distributed/wiki/__init__.py
diff --git a/distributed/wiki/annoy_indexer.yml b/distributed/wiki/annoy_indexer.yml
diff --git a/distributed/wiki/app.py b/distributed/wiki/app.py
@@ -0,0 +1,23 @@
+from jina import Flow, Document, DocumentArray
+
+f = Flow.load_config(  # Flow used for the indexing run below, built from a YAML config
+    './local/index.yml'
+)
+
+d1 = Document(id=1, text='foo1 is foo fool full fu')  # three toy documents; texts differ only in the first token
+d2 = Document(id=2, text='foo2 is foo fool full fu')
+d3 = Document(id=3, text='foo3 is foo fool full fu')
+
+
+def print_matches(req):  # callback handed to f.search(on_done=...); invoked when the task is done
+    for idx, d in enumerate(req.docs[0].matches[:3]):  # top-3 matches of the first doc in the response
+        print(f'[{idx}]{d.score.value:.2f}: "{d.text}"')  # '.2f' = two decimals ('2f' only set a min width of 2)
+
+
+with f:
+    f.index(inputs=DocumentArray([d1, d2, d3]))  # send the three sample documents through the index Flow
+
+with Flow.load_config(
+    './local/query.yml'
+) as f:  # rebinds `f`: from here on it refers to the query Flow, not the index Flow
+    f.search(inputs=d2, on_done=print_matches)  # query with d2; print_matches is passed as the on_done callback
diff --git a/distributed/wiki/chunk_indexer.yml b/distributed/wiki/chunk_indexer.yml
diff --git a/distributed/wiki/chunk_merger.yml b/distributed/wiki/chunk_merger.yml
diff --git a/distributed/wiki/doc.yml b/distributed/wiki/doc.yml
diff --git a/distributed/wiki/encoder.yml b/distributed/wiki/encoder.yml
diff --git a/distributed/wiki/local/index.yml b/distributed/wiki/local/index.yml
@@ -1,39 +1,22 @@
 jtype: Flow
 version: '1'
 with:
-  rest_api: {{ JINA_GATEWAY_REST }}
-  port_expose: {{ JINA_GATEWAY_PORT_EXPOSE }}
+  workspace: $JINA_WORKDIR
+  py_modules:
+    - wiki_executors.py
 pods:
   - name: segmenter
-    polling: any
-    shards: {{ JINA_SEGMENTER_SHARDS }}
-    uses: segment.yml
-    scheduling: {{ JINA_SCHEDULING }}
-    read_only: true
-    timeout_ready: 100000
+    uses: 
+      jtype: Segmenter
   - name: encoder
-    polling: any
-    scheduling: {{ JINA_SCHEDULING }}
-    uses: encoder.yml
-    shards: {{ JINA_ENCODER_SHARDS }}
-    timeout_ready: 100000
-    read_only: true
+    uses:
+      jtype: TextEncoder
   - name: vec_idx
-    polling: any
-    scheduling: {{ JINA_SCHEDULING }}
-    uses: annoy_indexer.yml
-    shards: {{ JINA_VEC_INDEXER_SHARDS }}
-    timeout_ready: 100000
+    uses:
+      jtype: AnnoyIndexer
   - name: doc_idx
-    polling: any
-    scheduling: {{ JINA_SCHEDULING }}
-    uses: doc.yml
-    shards: {{ JINA_KV_INDEXER_SHARDS }}
-    needs: gateway
-    timeout_ready: 100000
+    uses:
+      jtype: KeyValueIndexer
+    needs: segmenter
   - name: join_all
-    method: needs
-    uses: _merge
-    needs: [ doc_idx, vec_idx ]
-    read_only: true
-    timeout_ready: 100000
+    needs: [vec_idx, doc_idx]
diff --git a/distributed/wiki/local/query.yml b/distributed/wiki/local/query.yml
@@ -1,36 +1,23 @@
 jtype: Flow
 version: '1'
 with:
-  read_only: true
-  rest_api: {{ JINA_GATEWAY_REST }}
-  port_expose: {{ JINA_GATEWAY_PORT_EXPOSE }}
+  workspace: $JINA_WORKDIR
+  py_modules:
+    - wiki_executors.py
 pods:
   - name: segmenter
-    polling: all
-    shards: {{ JINA_SEGMENTER_SHARDS }}
-    uses: segment.yml
-    read_only: true
+    uses: 
+      jtype: Segmenter
   - name: encoder
-    polling: all
-    scheduling: {{ JINA_SCHEDULING }}
-    uses: encoder.yml
-    shards: {{ JINA_ENCODER_SHARDS }}
-    uses_after: chunk_merger.yml
-    timeout_ready: -1
-    read_only: true
+    uses:
+      jtype: TextEncoder
   - name: vec_idx
-    scheduling: {{ JINA_SCHEDULING }}
-    uses: annoy_indexer.yml
-    shards: {{ JINA_VEC_INDEXER_SHARDS }}
-    polling: all
-    uses_after: chunk_merger.yml
-    timeout_ready: -1
-  - name: ranker
-    polling: all
-    shards: {{ JINA_RANKER_SHARDS }}
-    uses: ranker.yml
+    uses:
+      jtype: AnnoyIndexer
   - name: doc_idx
-    uses: doc.yml
-    shards: {{ JINA_KV_INDEXER_SHARDS }}
-    polling: all
-    timeout_ready: 100000
+    uses:
+      jtype: KeyValueIndexer
+  - name: ranker
+    uses:
+      jtype: AggregateRanker
+
diff --git a/distributed/wiki/ranker.yml b/distributed/wiki/ranker.yml
diff --git a/distributed/wiki/segment.yml b/distributed/wiki/segment.yml
diff --git a/distributed/wiki/segmenters.py b/distributed/wiki/segmenters.py