Skip to content
This repository has been archived by the owner on Nov 15, 2021. It is now read-only.

Commit

Permalink
Merge pull request #43 from wisdomify/feature_42
Browse files Browse the repository at this point in the history
Feature 42
  • Loading branch information
eubinecto authored Nov 9, 2021
2 parents dc40bed + 085c3ae commit bdf064c
Show file tree
Hide file tree
Showing 54 changed files with 623 additions and 545 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@

corpora
.metaflow
artifacts
wandb
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
71 changes: 71 additions & 0 deletions main_build_wisdom2def.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import io
import wandb
import pandas as pd
from metaflow import FlowSpec, step, Parameter
from wandb.integration.metaflow import wandb_log
from storyteller.constants import WISDOM2DEF_RAW_A, WISDOM2DEF_RAW_B, WANDB_PROJECT
from storyteller.preprocess import cleanse, normalise, augment, upsample
from storyteller.utils import get_url


class BuildWisdom2DefFlow(FlowSpec):
    """
    Build the "wisdom2def" dataset and upload it as a wandb artifact.

    Steps: start -> download -> preprocess -> end.
    """
    # the version of this artifact, given on the command line
    ver: str = Parameter('ver',
                         type=str,
                         help='The version of this artifact. Should be a single alphabet',
                         default="a")

    # --- dataframes handed from one step to the next --- #
    raw_df: pd.DataFrame  # the downloaded tab-separated text, parsed as-is
    all_df: pd.DataFrame  # raw_df after cleanse -> normalise -> augment -> upsample

    @step
    def start(self):
        """
        set ver to be available
        """
        self.next(self.download)

    @step
    def download(self):
        """
        ver -> raw_df

        :raises ValueError: if ver is not one of the supported versions.
        """
        # one raw source per supported version
        sources = {
            "a": WISDOM2DEF_RAW_A,
            "b": WISDOM2DEF_RAW_B,
        }
        if self.ver not in sources:
            # fail with a message that names the offending value and the valid ones
            raise ValueError(f"Unsupported ver: {self.ver!r}. Expected one of: {sorted(sources)}")
        text = get_url(sources[self.ver])
        self.raw_df = pd.read_csv(io.StringIO(text), delimiter="\t")
        self.next(self.preprocess)

    @step
    def preprocess(self):
        """
        raw_df -> all_df
        """
        self.all_df = (
            self.raw_df
            .pipe(cleanse)
            .pipe(normalise)
            .pipe(augment)
            .pipe(upsample)
        )
        self.next(self.end)

    @step
    @wandb_log(settings=wandb.Settings(project=WANDB_PROJECT))
    def end(self):
        """
        raw_df, all_df
        -> raw_table, all_table
        -> artifact: upload this
        """
        artifact = wandb.Artifact("wisdom2def", type="dataset")
        raw_table = wandb.Table(dataframe=self.raw_df)
        all_table = wandb.Table(dataframe=self.all_df)
        # add the tables to the artifact
        artifact.add(raw_table, "raw")
        artifact.add(all_table, "all")
        # alias the upload with its version letter as well as "latest"
        wandb.log_artifact(artifact, aliases=[self.ver, "latest"])


if __name__ == "__main__":
    # script entry point: instantiate the flow
    BuildWisdom2DefFlow()
87 changes: 87 additions & 0 deletions main_build_wisdom2eg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import json
from typing import List
import pandas as pd
import wandb
from tqdm import tqdm
from metaflow import FlowSpec, step, Parameter
from wandb.integration.metaflow import wandb_log
from storyteller.constants import WANDB_PROJECT
from storyteller.connectors import connect_to_es
from storyteller.downloaders import dl_wisdoms
from storyteller.elastic.docs import Story
from storyteller.elastic.searcher import Searcher
from storyteller.preprocess import parse, cleanse, normalise, augment, upsample


class BuildWisdom2EgFlow(FlowSpec):
    """
    Build the "wisdom2eg" dataset and upload it as a wandb artifact.

    Steps: start -> download -> search -> preprocess -> end.
    """
    # the version of this artifact, given on the command line
    ver: str = Parameter('ver',
                         type=str,
                         help='The version of this artifact. Should be a single alphabet',
                         default="a")

    # --- data handed from one step to the next --- #
    wisdoms: List[str]    # what dl_wisdoms returns for ver
    raw_df: pd.DataFrame  # one (wisdom, JSON-encoded search result) row per wisdom
    all_df: pd.DataFrame  # raw_df after parse -> cleanse -> normalise -> augment -> upsample

    @step
    def start(self):
        """
        set ver to be available
        """
        self.next(self.download)

    @step
    def download(self):
        """
        ver -> wisdoms
        """
        self.wisdoms = dl_wisdoms(self.ver)
        self.next(self.search)

    @step
    def search(self):
        """
        wisdoms -> raw_df
        """
        with connect_to_es() as es:
            search = Searcher(es)
            progress = tqdm(self.wisdoms,
                            desc="searching for wisdoms on stories...",
                            total=len(self.wisdoms))
            # each search result is dumped to a JSON string with non-ASCII characters kept as they are
            # https://stackoverflow.com/a/18337754
            records = [
                (wisdom,
                 json.dumps(search(wisdom, ",".join(Story.all_indices()), size=10000),
                            ensure_ascii=False))
                for wisdom in progress
            ]
        self.raw_df = pd.DataFrame(data=records, columns=["wisdom", "eg"])
        self.next(self.preprocess)

    @step
    def preprocess(self):
        """
        raw_df -> all_df
        """
        frame = self.raw_df
        # apply the preprocessing functions one after another, in this order
        for transform in (parse, cleanse, normalise, augment, upsample):
            frame = frame.pipe(transform)
        self.all_df = frame
        self.next(self.end)

    @step
    @wandb_log(settings=wandb.Settings(project=WANDB_PROJECT))
    def end(self):
        """
        raw_df, all_df
        -> raw_table, all_table
        -> wisdom2eg_artifact
        """
        wisdom2eg_artifact = wandb.Artifact("wisdom2eg", type="dataset")
        # wrap each dataframe in a table and add it to the artifact under its name
        tables = (
            (wandb.Table(dataframe=self.raw_df), "raw"),
            (wandb.Table(dataframe=self.all_df), "all"),
        )
        for table, name in tables:
            wisdom2eg_artifact.add(table, name)
        wandb.log_artifact(wisdom2eg_artifact, aliases=[self.ver, "latest"])


if __name__ == "__main__":
    # script entry point: instantiate the flow
    BuildWisdom2EgFlow()
95 changes: 95 additions & 0 deletions main_build_wisdom2query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import io
import wandb
import pandas as pd
from metaflow import FlowSpec, step, Parameter
from wandb.integration.metaflow import wandb_log
from storyteller.utils import get_url
from storyteller.constants import WISDOM2QUERY_RAW_A, WANDB_PROJECT
from storyteller.preprocess import cleanse, normalise, stratified_split


class BuildWisdom2QueryFlow(FlowSpec):
    """
    Build the "wisdom2query" dataset and upload it as a wandb artifact.

    Steps: start -> download -> preprocess -> val_test_split -> end.
    """
    # get the version of this artifact from command line
    # see: https://github.com/Netflix/metaflow/issues/175#issuecomment-610518458
    ver: str = Parameter('ver',
                         type=str,
                         help='The version of this artifact. Should be a single alphabet',
                         default="a")

    val_ratio: float = Parameter('val_ratio',
                                 type=float,
                                 help='The percentage of the validation set',
                                 default=0.2)
    seed: int = Parameter('seed',
                          type=int,
                          help='random seed',
                          default=410)
    # --- to be saved locally --- #
    raw_df: pd.DataFrame   # the downloaded tab-separated text, parsed as-is
    all_df: pd.DataFrame   # raw_df after cleanse -> normalise
    val_df: pd.DataFrame   # first frame returned by stratified_split
    test_df: pd.DataFrame  # second frame returned by stratified_split

    @step
    def start(self):
        """
        set ver to be available
        """
        self.next(self.download)

    @step
    def download(self):
        """
        ver -> raw_df

        :raises ValueError: if ver is not one of the supported versions.
        """
        # one raw source per supported version
        sources = {
            "a": WISDOM2QUERY_RAW_A,
        }
        if self.ver not in sources:
            # fail with a message that names the offending value and the valid ones
            raise ValueError(f"Unsupported ver: {self.ver!r}. Expected one of: {sorted(sources)}")
        text = get_url(sources[self.ver])
        self.raw_df = pd.read_csv(io.StringIO(text), delimiter="\t")
        self.next(self.preprocess)

    @step
    def preprocess(self):
        """
        raw_df -> all_df
        """
        self.all_df = (
            self.raw_df
            .pipe(cleanse)
            .pipe(normalise)
        )
        self.next(self.val_test_split)

    @step
    def val_test_split(self):
        """
        all_df -> val_df, test_df
        """
        # split the preprocessed frame, not the raw one, so that
        # val_df and test_df are cleansed and normalised like all_df
        self.val_df, self.test_df = stratified_split(self.all_df, self.val_ratio, self.seed)
        self.next(self.end)

    @step
    @wandb_log(settings=wandb.Settings(project=WANDB_PROJECT))
    def end(self):
        """
        raw_df, all_df, val_df, test_df
        -> raw_table, all_table, val_table, test_table
        -> artifact: upload this
        """
        artifact = wandb.Artifact("wisdom2query", type="dataset")
        artifact.metadata = {"ver": self.ver, "seed": self.seed}
        raw_table = wandb.Table(dataframe=self.raw_df)
        all_table = wandb.Table(dataframe=self.all_df)
        val_table = wandb.Table(dataframe=self.val_df)
        test_table = wandb.Table(dataframe=self.test_df)
        # add the tables to the artifact
        artifact.add(raw_table, "raw")
        artifact.add(all_table, "all")
        artifact.add(val_table, "val")
        artifact.add(test_table, "test")
        # alias the upload with its version letter as well as "latest"
        wandb.log_artifact(artifact, aliases=[self.ver, "latest"])


if __name__ == "__main__":
    # script entry point: instantiate the flow
    BuildWisdom2QueryFlow()
54 changes: 54 additions & 0 deletions main_build_wisdoms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import io
import wandb
import pandas as pd
from wandb.integration.metaflow import wandb_log
from metaflow import FlowSpec, step, Parameter
from storyteller.utils import get_url
from storyteller.constants import WISDOMS_A, WISDOMS_B, WANDB_PROJECT


class BuildWisdomsFlow(FlowSpec):
    """
    Build the "wisdoms" dataset and upload it as a wandb artifact.

    Steps: start -> download -> end.
    """
    # get the version of this artifact from command line
    # see: https://github.com/Netflix/metaflow/issues/175#issuecomment-610518458
    ver: str = Parameter('ver',
                         type=str,
                         help='The version of this artifact. Should be a single alphabet',
                         default="a")

    # --- to be saved locally --- #
    all_df: pd.DataFrame  # the downloaded tab-separated text, parsed as-is

    @step
    def start(self):
        """
        set ver to be available
        """
        self.next(self.download)

    @step
    def download(self):
        """
        ver -> all_df

        :raises ValueError: if ver is not one of the supported versions.
        """
        # one source per supported version
        sources = {
            "a": WISDOMS_A,
            "b": WISDOMS_B,
        }
        if self.ver not in sources:
            # fail with a message that names the offending value and the valid ones
            raise ValueError(f"Unsupported ver: {self.ver!r}. Expected one of: {sorted(sources)}")
        text = get_url(sources[self.ver])
        self.all_df = pd.read_csv(io.StringIO(text), delimiter="\t")
        self.next(self.end)

    @step
    @wandb_log(settings=wandb.Settings(project=WANDB_PROJECT))
    def end(self):
        """
        package the dataframes into an artifact
        """
        artifact = wandb.Artifact(name="wisdoms", type="dataset")
        table = wandb.Table(dataframe=self.all_df)
        artifact.add(table, name="all")
        # alias the upload with its version letter as well as "latest"
        wandb.log_artifact(artifact, aliases=[self.ver, "latest"])


if __name__ == "__main__":
    # the flow is registered here so that the .metaflow directory
    # is created under storyteller/main
    BuildWisdomsFlow()

Loading

0 comments on commit bdf064c

Please sign in to comment.