Merge pull request #43 from wisdomify/feature_42

Feature 42
wisdomify · Nov 9, 2021 · bdf064c · bdf064c
2 parents dc40bed + 085c3ae
commit bdf064c
Show file tree

Hide file tree

Showing 54 changed files with 623 additions and 545 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+
+corpora
+.metaflow
+artifacts
+wandb
diff --git a/storyteller/explore/explore_chunked.py → explore/explore_chunked.py b/storyteller/explore/explore_chunked.py → explore/explore_chunked.py
diff --git a/storyteller/explore/explore_corpora_bs.py → explore/explore_corpora_bs.py b/storyteller/explore/explore_corpora_bs.py → explore/explore_corpora_bs.py
diff --git a/storyteller/explore/explore_corpora_cvc.py → explore/explore_corpora_cvc.py b/storyteller/explore/explore_corpora_cvc.py → explore/explore_corpora_cvc.py
diff --git a/storyteller/explore/explore_corpora_ds.py → explore/explore_corpora_ds.py b/storyteller/explore/explore_corpora_ds.py → explore/explore_corpora_ds.py
diff --git a/storyteller/explore/explore_corpora_gk.py → explore/explore_corpora_gk.py b/storyteller/explore/explore_corpora_gk.py → explore/explore_corpora_gk.py
diff --git a/storyteller/explore/explore_corpora_kc.py → explore/explore_corpora_kc.py b/storyteller/explore/explore_corpora_kc.py → explore/explore_corpora_kc.py
diff --git a/storyteller/explore/explore_corpora_kcss.py → explore/explore_corpora_kcss.py b/storyteller/explore/explore_corpora_kcss.py → explore/explore_corpora_kcss.py
diff --git a/storyteller/explore/explore_corpora_kept.py → explore/explore_corpora_kept.py b/storyteller/explore/explore_corpora_kept.py → explore/explore_corpora_kept.py
diff --git a/storyteller/explore/explore_corpora_kess.py → explore/explore_corpora_kess.py b/storyteller/explore/explore_corpora_kess.py → explore/explore_corpora_kess.py
diff --git a/storyteller/explore/explore_corpora_kets.py → explore/explore_corpora_kets.py b/storyteller/explore/explore_corpora_kets.py → explore/explore_corpora_kets.py
diff --git a/storyteller/explore/explore_corpora_kj.py → explore/explore_corpora_kj.py b/storyteller/explore/explore_corpora_kj.py → explore/explore_corpora_kj.py
diff --git a/...ler/explore/explore_corpora_korea_univ.py → explore/explore_corpora_korea_univ.py b/...ler/explore/explore_corpora_korea_univ.py → explore/explore_corpora_korea_univ.py
diff --git a/storyteller/explore/explore_corpora_ksns.py → explore/explore_corpora_ksns.py b/storyteller/explore/explore_corpora_ksns.py → explore/explore_corpora_ksns.py
diff --git a/storyteller/explore/explore_corpora_mr.py → explore/explore_corpora_mr.py b/storyteller/explore/explore_corpora_mr.py → explore/explore_corpora_mr.py
diff --git a/storyteller/explore/explore_corpora_sc.py → explore/explore_corpora_sc.py b/storyteller/explore/explore_corpora_sc.py → explore/explore_corpora_sc.py
diff --git a/storyteller/explore/explore_corpora_sfc.py → explore/explore_corpora_sfc.py b/storyteller/explore/explore_corpora_sfc.py → explore/explore_corpora_sfc.py
diff --git a/storyteller/explore/explore_corpora_sfke.py → explore/explore_corpora_sfke.py b/storyteller/explore/explore_corpora_sfke.py → explore/explore_corpora_sfke.py
diff --git a/storyteller/explore/explore_doc.py → explore/explore_doc.py b/storyteller/explore/explore_doc.py → explore/explore_doc.py
diff --git a/storyteller/explore/explore_split.py → explore/explore_split.py b/storyteller/explore/explore_split.py → explore/explore_split.py
diff --git a/...plore/explore_update_dataset_structure.py → explore/explore_update_dataset_structure.py b/...plore/explore_update_dataset_structure.py → explore/explore_update_dataset_structure.py
diff --git a/...e/explore_upload_gold_test_queries_wnb.py → ...e/explore_upload_gold_test_queries_wnb.py b/...e/explore_upload_gold_test_queries_wnb.py → ...e/explore_upload_gold_test_queries_wnb.py
diff --git a/...yteller/explore/explore_wandb_artifact.py → explore/explore_wandb_artifact.py b/...yteller/explore/explore_wandb_artifact.py → explore/explore_wandb_artifact.py
diff --git a/...xplore/explore_wandb_download_artifact.py → explore/explore_wandb_download_artifact.py b/...xplore/explore_wandb_download_artifact.py → explore/explore_wandb_download_artifact.py
diff --git a/storyteller/explore/explore_wandb_init.py → explore/explore_wandb_init.py b/storyteller/explore/explore_wandb_init.py → explore/explore_wandb_init.py
diff --git a/...ler/explore/explore_wandb_log_artifact.py → explore/explore_wandb_log_artifact.py b/...ler/explore/explore_wandb_log_artifact.py → explore/explore_wandb_log_artifact.py
diff --git a/...ler/explore/explore_wandb_use_artifact.py → explore/explore_wandb_use_artifact.py b/...ler/explore/explore_wandb_use_artifact.py → explore/explore_wandb_use_artifact.py
diff --git a/storyteller/explore/explore_wisdoms.py → explore/explore_wisdoms.py b/storyteller/explore/explore_wisdoms.py → explore/explore_wisdoms.py
diff --git a/main_build_wisdom2def.py b/main_build_wisdom2def.py
@@ -0,0 +1,71 @@
+import io
+import wandb
+import pandas as pd
+from metaflow import FlowSpec, step, Parameter
+from wandb.integration.metaflow import wandb_log
+from storyteller.constants import WISDOM2DEF_RAW_A, WISDOM2DEF_RAW_B, WANDB_PROJECT
+from storyteller.preprocess import cleanse, normalise, augment, upsample
+from storyteller.utils import get_url
+
+
+class BuildWisdom2DefFlow(FlowSpec):
+    """
+    Metaflow flow that builds the "wisdom2def" dataset artifact and logs it to W&B.
+
+    Steps run in order: start -> download -> preprocess -> end.
+    """
+    ver: str = Parameter('ver',
+                         type=str,
+                         help='The version of this artifact. Should be a single alphabet',
+                         default="a")
+
+    # --- data handed from one step to the next --- #
+    raw_df: pd.DataFrame  # assigned in download()
+    all_df: pd.DataFrame  # assigned in preprocess()
+
+    @step
+    def start(self):
+        """
+        set ver to be available
+        """
+        self.next(self.download)
+
+    @step
+    def download(self):
+        """
+        ver  -> raw_df
+
+        Looks up the source registered for ver, retrieves it with get_url and
+        parses the result as tab-separated text.
+
+        :raises ValueError: if ver is not one of the supported versions.
+        """
+        # kept local (not class-level) so that it does not become a flow attribute
+        sources = {"a": WISDOM2DEF_RAW_A, "b": WISDOM2DEF_RAW_B}
+        if self.ver not in sources:
+            raise ValueError(f"Unsupported ver: {self.ver!r}. Expected one of {sorted(sources)}.")
+        # NOTE(review): get_url presumably returns the remote content as a str - confirm
+        text = get_url(sources[self.ver])
+        self.raw_df = pd.read_csv(io.StringIO(text), delimiter="\t")
+        self.next(self.preprocess)
+
+    @step
+    def preprocess(self):
+        """
+        raw_df -> all_df
+
+        Pipes raw_df through cleanse, normalise, augment and upsample, in that order.
+        """
+        self.all_df = self.raw_df \
+                          .pipe(cleanse) \
+                          .pipe(normalise) \
+                          .pipe(augment) \
+                          .pipe(upsample)
+        self.next(self.end)
+
+    @step
+    @wandb_log(settings=wandb.Settings(project=WANDB_PROJECT))
+    def end(self):
+        """
+        raw_df, all_df
+        -> raw_table, all_table
+        -> artifact: upload this
+
+        The artifact is logged under the aliases [ver, "latest"].
+        """
+        artifact = wandb.Artifact("wisdom2def", type="dataset")
+        raw_table = wandb.Table(dataframe=self.raw_df)
+        all_table = wandb.Table(dataframe=self.all_df)
+        # add the tables to the artifact
+        artifact.add(raw_table, "raw")
+        artifact.add(all_table, "all")
+        wandb.log_artifact(artifact, aliases=[self.ver, "latest"])
+
+
+# Script entry point: instantiating the flow class is how Metaflow starts the flow's command line.
+if __name__ == '__main__':
+    BuildWisdom2DefFlow()
diff --git a/main_build_wisdom2eg.py b/main_build_wisdom2eg.py
@@ -0,0 +1,87 @@
+import json
+from typing import List
+import pandas as pd
+import wandb
+from tqdm import tqdm
+from metaflow import FlowSpec, step, Parameter
+from wandb.integration.metaflow import wandb_log
+from storyteller.constants import WANDB_PROJECT
+from storyteller.connectors import connect_to_es
+from storyteller.downloaders import dl_wisdoms
+from storyteller.elastic.docs import Story
+from storyteller.elastic.searcher import Searcher
+from storyteller.preprocess import parse, cleanse, normalise, augment, upsample
+
+
+class BuildWisdom2EgFlow(FlowSpec):
+    """
+    Metaflow flow that builds the "wisdom2eg" dataset artifact and logs it to W&B.
+
+    Steps run in order: start -> download -> search -> preprocess -> end.
+    """
+    ver: str = Parameter(
+        'ver',
+        type=str,
+        help='The version of this artifact. Should be a single alphabet',
+        default="a",
+    )
+
+    # --- data handed from one step to the next --- #
+    wisdoms: List[str]
+    raw_df: pd.DataFrame
+    all_df: pd.DataFrame
+
+    @step
+    def start(self):
+        """
+        make ver available to the following steps
+        """
+        self.next(self.download)
+
+    @step
+    def download(self):
+        """
+        ver -> wisdoms
+        """
+        self.wisdoms = dl_wisdoms(self.ver)
+        self.next(self.search)
+
+    @step
+    def search(self):
+        """
+        wisdoms -> raw_df
+
+        One row per wisdom: the wisdom itself plus the JSON dump of what the
+        searcher returned for it across all Story indices.
+        """
+        with connect_to_es() as es:
+            searcher = Searcher(es)
+            progress = tqdm(self.wisdoms,
+                            desc="searching for wisdoms on stories...",
+                            total=len(self.wisdoms))
+            # ensure_ascii=False, see https://stackoverflow.com/a/18337754
+            pairs = [
+                (wisdom,
+                 json.dumps(searcher(wisdom, ",".join(Story.all_indices()), size=10000),
+                            ensure_ascii=False))
+                for wisdom in progress
+            ]
+        self.raw_df = pd.DataFrame(data=pairs, columns=["wisdom", "eg"])
+        self.next(self.preprocess)
+
+    @step
+    def preprocess(self):
+        """
+        raw_df -> all_df
+
+        Applies parse, cleanse, normalise, augment and upsample, in that order.
+        """
+        frame = self.raw_df
+        for stage in (parse, cleanse, normalise, augment, upsample):
+            frame = frame.pipe(stage)
+        self.all_df = frame
+        self.next(self.end)
+
+    @step
+    @wandb_log(settings=wandb.Settings(project=WANDB_PROJECT))
+    def end(self):
+        """
+        raw_df, all_df
+        -> raw_table, all_table
+        -> wisdom2eg_artifact, logged under the aliases [ver, "latest"]
+        """
+        artifact = wandb.Artifact("wisdom2eg", type="dataset")
+        # build both tables first, then attach them in the order raw -> all
+        tables = {
+            "raw": wandb.Table(dataframe=self.raw_df),
+            "all": wandb.Table(dataframe=self.all_df),
+        }
+        for name, table in tables.items():
+            artifact.add(table, name)
+        wandb.log_artifact(artifact, aliases=[self.ver, "latest"])
+
+
+# Script entry point: instantiating the flow class is how Metaflow starts the flow's command line.
+if __name__ == '__main__':
+    BuildWisdom2EgFlow()
diff --git a/main_build_wisdom2query.py b/main_build_wisdom2query.py
@@ -0,0 +1,95 @@
+import io
+import wandb
+import pandas as pd
+from metaflow import FlowSpec, step, Parameter
+from wandb.integration.metaflow import wandb_log
+from storyteller.utils import get_url
+from storyteller.constants import WISDOM2QUERY_RAW_A, WANDB_PROJECT
+from storyteller.preprocess import cleanse, normalise, stratified_split
+
+
+class BuildWisdom2QueryFlow(FlowSpec):
+    """
+    Metaflow flow that builds the "wisdom2query" dataset artifact and logs it to W&B.
+
+    Steps run in order: start -> download -> preprocess -> val_test_split -> end.
+    """
+    # get the version of this artifact from command line
+    # """https://github.com/Netflix/metaflow/issues/175#issuecomment-610518458"""
+    ver: str = Parameter('ver',
+                         type=str,
+                         help='The version of this artifact. Should be a single alphabet',
+                         default="a")
+
+    val_ratio = Parameter('val_ratio',
+                          type=float,
+                          help='The percentage of the validation set',
+                          default=0.2)
+    seed = Parameter('seed',
+                     type=int,
+                     help='random seed',
+                     default=410)
+    # --- to be saved locally --- #
+    raw_df: pd.DataFrame   # assigned in download()
+    all_df: pd.DataFrame   # assigned in preprocess()
+    val_df: pd.DataFrame   # assigned in val_test_split()
+    test_df: pd.DataFrame  # assigned in val_test_split()
+
+    @step
+    def start(self):
+        """
+        set ver to be available
+        """
+        self.next(self.download)
+
+    @step
+    def download(self):
+        """
+        ver  -> raw_df
+
+        Retrieves the raw table with get_url and parses it as tab-separated text.
+
+        :raises ValueError: if ver is anything other than "a".
+        """
+        if self.ver != "a":
+            raise ValueError(f"Unsupported ver: {self.ver!r}. Only 'a' is available.")
+        # NOTE(review): get_url presumably returns the remote content as a str - confirm
+        text = get_url(WISDOM2QUERY_RAW_A)
+        self.raw_df = pd.read_csv(io.StringIO(text), delimiter="\t")
+        self.next(self.preprocess)
+
+    @step
+    def preprocess(self):
+        """
+        raw_df -> all_df
+
+        Pipes raw_df through cleanse and normalise, in that order.
+        """
+        self.all_df = self.raw_df \
+                          .pipe(cleanse) \
+                          .pipe(normalise)
+        self.next(self.val_test_split)
+
+    @step
+    def val_test_split(self):
+        """
+        all_df -> val_df, test_df
+        """
+        # annotation-only statements: type hints for the two Parameters, nothing is assigned
+        self.val_ratio: float
+        self.seed: int
+        # split the preprocessed all_df (not raw_df), so that cleanse/normalise
+        # also apply to the validation and test tables
+        self.val_df, self.test_df = stratified_split(self.all_df, self.val_ratio, self.seed)
+        self.next(self.end)
+
+    @step
+    @wandb_log(settings=wandb.Settings(project=WANDB_PROJECT))
+    def end(self):
+        """
+        raw_df, all_df, val_df, test_df
+        -> raw_table, all_table, val_table, test_table
+        -> artifact: upload this
+
+        ver and seed are recorded in the artifact metadata; the artifact is
+        logged under the aliases [ver, "latest"].
+        """
+        artifact = wandb.Artifact("wisdom2query", type="dataset")
+        artifact.metadata = {"ver": self.ver, "seed": self.seed}
+        raw_table = wandb.Table(dataframe=self.raw_df)
+        all_table = wandb.Table(dataframe=self.all_df)
+        val_table = wandb.Table(dataframe=self.val_df)
+        test_table = wandb.Table(dataframe=self.test_df)
+        # add the tables to the artifact
+        artifact.add(raw_table, "raw")
+        artifact.add(all_table, "all")
+        artifact.add(val_table, "val")
+        artifact.add(test_table, "test")
+        wandb.log_artifact(artifact, aliases=[self.ver, "latest"])
+
+
+# Script entry point: instantiating the flow class is how Metaflow starts the flow's command line.
+if __name__ == '__main__':
+    BuildWisdom2QueryFlow()
diff --git a/main_build_wisdoms.py b/main_build_wisdoms.py
@@ -0,0 +1,54 @@
+import io
+import wandb
+import pandas as pd
+from wandb.integration.metaflow import wandb_log
+from metaflow import FlowSpec, step, Parameter
+from storyteller.utils import get_url
+from storyteller.constants import WISDOMS_A, WISDOMS_B, WANDB_PROJECT
+
+
+class BuildWisdomsFlow(FlowSpec):
+    """
+    Metaflow flow that builds the "wisdoms" dataset artifact and logs it to W&B.
+
+    Steps run in order: start -> download -> end.
+    """
+    # get the version of this artifact from command line
+    # """https://github.com/Netflix/metaflow/issues/175#issuecomment-610518458"""
+    ver: str = Parameter('ver',
+                         type=str,
+                         help='The version of this artifact. Should be a single alphabet',
+                         default="a")
+
+    # --- to be saved locally --- #
+    all_df: pd.DataFrame  # assigned in download()
+
+    @step
+    def start(self):
+        """
+        entry step; hands over to download
+        """
+        self.next(self.download)
+
+    @step
+    def download(self):
+        """
+        ver  -> all_df
+
+        Looks up the source registered for ver, retrieves it with get_url and
+        parses the result as tab-separated text.
+
+        :raises ValueError: if ver is not one of the supported versions.
+        """
+        # kept local (not class-level) so that it does not become a flow attribute
+        sources = {"a": WISDOMS_A, "b": WISDOMS_B}
+        if self.ver not in sources:
+            raise ValueError(f"Unsupported ver: {self.ver!r}. Expected one of {sorted(sources)}.")
+        # NOTE(review): get_url presumably returns the remote content as a str - confirm
+        text = get_url(sources[self.ver])
+        self.all_df = pd.read_csv(io.StringIO(text), delimiter="\t")
+        self.next(self.end)
+
+    @step
+    @wandb_log(settings=wandb.Settings(project=WANDB_PROJECT))
+    def end(self):
+        """
+        package the dataframes into an artifact
+
+        all_df is added as the table "all"; the artifact is logged under the
+        aliases [ver, "latest"].
+        """
+        artifact = wandb.Artifact(name="wisdoms", type="dataset")
+        table = wandb.Table(dataframe=self.all_df)
+        artifact.add(table, name="all")
+        wandb.log_artifact(artifact, aliases=[self.ver, "latest"])
+
+
+# Script entry point: instantiating the flow class is how Metaflow starts the flow's command line.
+if __name__ == '__main__':
+    # --- we register them here so that .metaflow directory is created under storyteller/main --- #
+    BuildWisdomsFlow()
+