Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wikidata fix and unit tests #932

Merged
merged 7 commits into from
Sep 17, 2019
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion notebooks/01_prepare_data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ data preparation tasks witnessed in recommendation system development.
| --- | --- |
| [data_split](data_split.ipynb) | Details on splitting data (randomly, chronologically, etc). |
| [data_transform](data_transform.ipynb) | Guidance on how to transform (implicit / explicit) data for building collaborative filtering typed recommender. |
| [wikidata knowledge graph](wikidata_KG.ipynb) | Details on how to create a knowledge graph using Wikidata |
| [wikidata knowledge graph](wikidata_knowledge_graph.ipynb) | Details on how to create a knowledge graph using Wikidata |

### Data split

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"metadata": {},
"source": [
"## Wikidata Knowledge Graph Extraction\n",
"Many recommendation algorithms (DKN, RippleNet, KGCN) use Knowledge Graphs as an external source of information. We found that one of the bottlenecks to benchmark current algorithms like DKN, RippleNet or KGCN is that they used Microsoft Satori. As Satori is not open source, it's not possible to replicate the results found in the papers. The solution is using other open source KGs.\n",
"Many recommendation algorithms (DKN, RippleNet, KGCN) use Knowledge Graphs (KGs) as an external source of information. We found that one of the bottlenecks to benchmark current algorithms like DKN, RippleNet or KGCN is that they used Microsoft Satori. As Satori is not open source, it's not possible to replicate the results found in the papers. The solution is using other open source KGs.\n",
"\n",
"The goal of this notebook is to provide examples of how to interact with Wikipedia queries and Wikidata to extract a Knowledge Graph that can be used with the mentioned algorithms.\n",
"\n",
Expand Down Expand Up @@ -34,19 +34,17 @@
"sys.path.append(\"../../\")\n",
"print(\"System version: {}\".format(sys.version))\n",
"\n",
"import papermill as pm\n",
"import pandas as pd\n",
"import networkx as nx\n",
"import matplotlib.pyplot as plt\n",
"from reco_utils.dataset import movielens\n",
"\n",
"from reco_utils.dataset.wikidata import (search_wikidata, \n",
" find_wikidata_id, \n",
" query_entity_links, \n",
" read_linked_entities,\n",
" query_entity_description)\n",
"\n",
"import networkx as nx\n",
"import matplotlib.pyplot as plt\n",
"from tqdm import tqdm\n",
"\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.common.notebook_utils import is_jupyter"
" query_entity_description)\n"
]
},
{
Expand Down Expand Up @@ -548,11 +546,8 @@
}
],
"source": [
"# Record results with papermill for tests - ignore this cell\n",
"if is_jupyter():\n",
" # Record results with papermill for unit-tests\n",
" import papermill as pm\n",
" pm.record(\"length_result\", number_movies)"
"# Record results with papermill for unit-tests\n",
"pm.record(\"length_result\", number_movies)"
]
},
{
Expand All @@ -566,9 +561,9 @@
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python (reco_base)",
"display_name": "Python (reco_bare)",
"language": "python",
"name": "reco_base"
"name": "reco_bare"
miguelgfierro marked this conversation as resolved.
Show resolved Hide resolved
},
"language_info": {
"codemirror_mode": {
Expand Down
25 changes: 14 additions & 11 deletions reco_utils/dataset/wikidata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@

import pandas as pd
import requests
import logging

logger = logging.getLogger(__name__)

API_URL_WIKIPEDIA = "https://en.wikipedia.org/w/api.php"
API_URL_WIKIDATA = "https://query.wikidata.org/sparql"
Expand Down Expand Up @@ -55,11 +57,16 @@ def find_wikidata_id(name, limit=1, session=None):

try:
response = session.get(API_URL_WIKIPEDIA, params=params)
page_id = response.json()["query"]["search"][0]["pageid"]
except Exception as e:
miguelgfierro marked this conversation as resolved.
Show resolved Hide resolved
# TODO: log exception
# print(e)
logger.error("CONNECTION ERROR")
logger.error(e)
return "badRequest"

n_results = response.json()["query"]["searchinfo"]["totalhits"]
if n_results == 0:
return "entityNotFound"
else:
page_id = response.json()["query"]["search"][0]["pageid"]

params = dict(
action="query",
Expand All @@ -75,8 +82,8 @@ def find_wikidata_id(name, limit=1, session=None):
"wikibase_item"
]
except Exception as e:
# TODO: log exception
# print(e)
# TODO: distinguish between connection error and entity not found
logger.error("ENTITY NOT FOUND")
return "entityNotFound"

return entity_id
Expand Down Expand Up @@ -133,9 +140,7 @@ def query_entity_links(entity_id, session=None):
API_URL_WIKIDATA, params=dict(query=query, format="json")
).json()
except Exception as e:
# TODO log exception
# print(e)
# print("Entity ID not Found in Wikidata")
logger.error("ENTITY NOT FOUND")
return {}

return data
Expand Down Expand Up @@ -195,9 +200,7 @@ def query_entity_description(entity_id, session=None):
r = session.get(API_URL_WIKIDATA, params=dict(query=query, format="json"))
description = r.json()["results"]["bindings"][0]["o"]["value"]
except Exception as e:
# TODO: log exception
# print(e)
# print("Description not found")
logger.error("DESCRIPTION NOT FOUND")
return "descriptionNotFound"

return description
Expand Down
21 changes: 7 additions & 14 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,7 @@ def spark(app_name="Sample", url="local[*]"):
SparkSession: new Spark session
"""

config = {"spark.local.dir": "/mnt",
"spark.sql.shuffle.partitions": 1}
config = {"spark.local.dir": "/mnt", "spark.sql.shuffle.partitions": 1}
spark = start_or_get_spark(app_name=app_name, url=url, config=config)
yield spark
spark.stop()
Expand Down Expand Up @@ -185,15 +184,11 @@ def notebooks():

# Path for the notebooks
paths = {
"template": os.path.join(
folder_notebooks, "template.ipynb"
),
"template": os.path.join(folder_notebooks, "template.ipynb"),
"sar_single_node": os.path.join(
folder_notebooks, "00_quick_start", "sar_movielens.ipynb"
),
"ncf": os.path.join(
folder_notebooks, "00_quick_start", "ncf_movielens.ipynb"
),
"ncf": os.path.join(folder_notebooks, "00_quick_start", "ncf_movielens.ipynb"),
"als_pyspark": os.path.join(
folder_notebooks, "00_quick_start", "als_movielens.ipynb"
),
Expand All @@ -215,8 +210,8 @@ def notebooks():
"data_split": os.path.join(
folder_notebooks, "01_prepare_data", "data_split.ipynb"
),
"wikidata_KG": os.path.join(
folder_notebooks, "01_prepare_data", "wikidata_KG.ipynb"
"wikidata_knowledge_graph": os.path.join(
folder_notebooks, "01_prepare_data", "wikidata_knowledge_graph.ipynb"
),
"als_deep_dive": os.path.join(
folder_notebooks, "02_model", "als_deep_dive.ipynb"
Expand All @@ -239,9 +234,7 @@ def notebooks():
"mmlspark_lightgbm_criteo": os.path.join(
folder_notebooks, "02_model", "mmlspark_lightgbm_criteo.ipynb"
),
"evaluation": os.path.join(
folder_notebooks, "03_evaluate", "evaluation.ipynb"
),
"evaluation": os.path.join(folder_notebooks, "03_evaluate", "evaluation.ipynb"),
"spark_tuning": os.path.join(
folder_notebooks, "04_model_select_and_optimize", "tuning_spark_als.ipynb"
),
Expand All @@ -250,6 +243,6 @@ def notebooks():
),
"nni_tuning_svd": os.path.join(
folder_notebooks, "04_model_select_and_optimize", "nni_surprise_svd.ipynb"
)
),
}
return paths
106 changes: 58 additions & 48 deletions tests/integration/test_notebooks_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,22 @@
"size, expected_values",
[
(
"1m",
{
"map": 0.060579,
"ndcg": 0.299245,
"precision": 0.270116,
"recall": 0.104350,
},
"1m",
{
"map": 0.060579,
"ndcg": 0.299245,
"precision": 0.270116,
"recall": 0.104350,
},
),
(
"10m",
{
"map": 0.098745,
"ndcg": 0.319625,
"precision": 0.275756,
"recall": 0.154014,
},
"10m",
{
"map": 0.098745,
"ndcg": 0.319625,
"precision": 0.275756,
"recall": 0.154014,
},
),
],
)
Expand All @@ -55,13 +55,13 @@ def test_sar_single_node_integration(notebooks, size, expected_values):
"size, expected_values",
[
(
"1m",
{
"map": 0.033914,
"ndcg": 0.231570,
"precision": 0.211923,
"recall": 0.064663,
},
"1m",
{
"map": 0.033914,
"ndcg": 0.231570,
"precision": 0.211923,
"recall": 0.064663,
},
),
# ("10m", {"map": , "ndcg": , "precision": , "recall": }), # OOM on test machine
],
Expand All @@ -86,17 +86,17 @@ def test_baseline_deep_dive_integration(notebooks, size, expected_values):
"size, expected_values",
[
(
"1m",
dict(
rmse=0.89,
mae=0.70,
rsquared=0.36,
exp_var=0.36,
map=0.011,
ndcg=0.10,
precision=0.093,
recall=0.025,
),
"1m",
dict(
rmse=0.89,
mae=0.70,
rsquared=0.36,
exp_var=0.36,
map=0.011,
ndcg=0.10,
precision=0.093,
recall=0.025,
),
),
# 10m works but takes too long
],
Expand Down Expand Up @@ -153,25 +153,35 @@ def test_vw_deep_dive_integration(notebooks, size, expected_values):
@pytest.mark.skipif(sys.platform == "win32", reason="nni not installable on windows")
def test_nni_tuning_svd(notebooks, tmp):
notebook_path = notebooks["nni_tuning_svd"]
pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME,
parameters=dict(MOVIELENS_DATA_SIZE="100k",
SURPRISE_READER="ml-100k",
TMP_DIR=tmp,
MAX_TRIAL_NUM=1,
NUM_EPOCHS=1,
WAITING_TIME=20,
MAX_RETRIES=50))
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(
MOVIELENS_DATA_SIZE="100k",
SURPRISE_READER="ml-100k",
TMP_DIR=tmp,
MAX_TRIAL_NUM=1,
NUM_EPOCHS=1,
WAITING_TIME=20,
MAX_RETRIES=50,
),
)


@pytest.mark.integration
def test_wikidata_integration(notebooks, tmp):
notebook_path = notebooks["wikidata_KG"]
sample_size = 5
pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME,
parameters=dict(MOVIELENS_DATA_SIZE='100k',
MOVIELENS_SAMPLE=True,
MOVIELENS_SAMPLE_SIZE=sample_size))

notebook_path = notebooks["wikidata_knowledge_graph"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(
MOVIELENS_DATA_SIZE="100k", MOVIELENS_SAMPLE=True, MOVIELENS_SAMPLE_SIZE=5
),
)

results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
assert results["length_result"] == sample_size
# FIXME: The return number should be always 5, but sometimes we get 4, find out why
assert results["length_result"] > 4
miguelgfierro marked this conversation as resolved.
Show resolved Hide resolved

38 changes: 25 additions & 13 deletions tests/unit/test_notebooks_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,24 +57,36 @@ def test_vw_deep_dive_runs(notebooks):
@pytest.mark.notebooks
def test_lightgbm(notebooks):
notebook_path = notebooks["lightgbm_quickstart"]
pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME,
parameters=dict(MAX_LEAF=32,
MIN_DATA=20,
NUM_OF_TREES=10,
TREE_LEARNING_RATE=0.15,
EARLY_STOPPING_ROUNDS=20,
METRIC="auc"))
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(
MAX_LEAF=32,
MIN_DATA=20,
NUM_OF_TREES=10,
TREE_LEARNING_RATE=0.15,
EARLY_STOPPING_ROUNDS=20,
METRIC="auc",
),
)


@pytest.mark.notebooks
def test_wikidata_runs(notebooks, tmp):
notebook_path = notebooks["wikidata_KG"]
notebook_path = notebooks["wikidata_knowledge_graph"]
MOVIELENS_SAMPLE_SIZE = 5
pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME,
parameters=dict(MOVIELENS_DATA_SIZE='100k',
MOVIELENS_SAMPLE=True,
MOVIELENS_SAMPLE_SIZE=MOVIELENS_SAMPLE_SIZE))

pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(
MOVIELENS_DATA_SIZE="100k",
MOVIELENS_SAMPLE=True,
MOVIELENS_SAMPLE_SIZE=MOVIELENS_SAMPLE_SIZE,
),
)


@pytest.mark.notebooks
def test_rlrmc_quickstart_runs(notebooks):
Expand Down
Loading