Skip to content

Commit

Permalink
Statistics is missing multi version of moviegraphbenchmark (#40)
Browse files Browse the repository at this point in the history
* Add MGB multi setting to statistics creation

* Move statistics function, adapt and update csv

* Adapt import and README

* Move import again
  • Loading branch information
dobraczka authored Apr 8, 2024
1 parent 1f57f5d commit ba2a1cd
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 64 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ Datasets
Broader statistics are provided in `dataset_statistics.csv`. You can also get a pandas DataFrame with statistics for specific datasets, for example to create tables for publications:
```
>>> ds = MovieGraphBenchmark(graph_pair="multi")
>>> from sylloge.base import create_statistics_df
>>> from sylloge.create_statistic import create_statistics_df
>>> stats_df = create_statistics_df([ds])
>>> stats_df.loc[("MovieGraphBenchmark","moviegraphbenchmark_multi","imdb")]
Entities Relation Triples Attribute Triples ... Clusters Intra-dataset Matches All Matches
Expand Down
7 changes: 5 additions & 2 deletions dataset_statistics.csv
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,11 @@ MovieGraphBenchmark,moviegraphbenchmark_imdb_tvdb,imdb,5129,17507,20800,3,13,608
MovieGraphBenchmark,moviegraphbenchmark_imdb_tvdb,tvdb,7814,15455,20902,3,9,7683,1483,22663,25583
MovieGraphBenchmark,moviegraphbenchmark_tmdb_tvdb,tmdb,6061,27903,23761,4,30,9991,1920,64,26138
MovieGraphBenchmark,moviegraphbenchmark_tmdb_tvdb,tvdb,7814,15455,20902,3,9,7683,1920,22663,26138
MED_BBK,med_bbk,MED,9162,158357,11467,32,19,10858,8885,0,5619
MED_BBK,med_bbk,BBK,9162,50307,44987,20,21,36608,8885,0,5619
MovieGraphBenchmark,moviegraphbenchmark_multi,imdb,5129,17507,20800,3,13,6082,3598,1,31230
MovieGraphBenchmark,moviegraphbenchmark_multi,tmdb,6061,27903,23761,4,30,9991,3598,64,31230
MovieGraphBenchmark,moviegraphbenchmark_multi,tvdb,7814,15455,20902,3,9,7683,3598,22663,31230
MED_BBK,med_bbk,MED,9162,158357,11467,32,19,10858,9162,0,9162
MED_BBK,med_bbk,BBK,9162,50307,44987,20,21,36608,9162,0,9162
OAEI,oaei_marvelcinematicuniverse_marvel,marvelcinematicuniverse,216033,1094598,130517,130,110,56566,1654,0,1654
OAEI,oaei_marvelcinematicuniverse_marvel,marvel,1472619,5152898,1580468,63,127,749980,1654,0,1654
OAEI,oaei_memoryalpha_memorybeta,memoryalpha,254537,2096198,430730,180,287,226110,9296,0,9296
Expand Down
2 changes: 0 additions & 2 deletions sylloge/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
TrainTestValSplit,
ZipEADataset,
ZipEADatasetWithPreSplitFolds,
create_statistics_df,
)
from .id_mapped import IdMappedEADataset
from .med_bbk_loader import MED_BBK
Expand All @@ -35,7 +34,6 @@
"BinaryZipEADatasetWithPreSplitFolds",
"ZipEADatasetWithPreSplitFolds",
"TrainTestValSplit",
"create_statistics_df",
]
__version__ = version(__package__)
logging.getLogger(__name__).setLevel(logging.INFO)
56 changes: 0 additions & 56 deletions sylloge/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
Callable,
Dict,
Generic,
Iterable,
List,
Literal,
Mapping,
Expand Down Expand Up @@ -1085,58 +1084,3 @@ class BinaryZipEADatasetWithPreSplitFolds(

def __repr__(self) -> str:
return self._binary_repr_adjustment(super().__repr__())


def create_statistics_df(
    datasets: Iterable[MultiSourceEADataset], seperate_attribute_relations: bool = True
):
    """Collect per-side statistics of the given datasets in a DataFrame.

    Args:
        datasets: Datasets to summarize. Each one must provide ``statistics()``,
            ``ent_links``, ``dataset_names`` and ``canonical_name``.
        seperate_attribute_relations: If ``True``, relation and attribute
            triples are reported in two columns ("Relation Triples" and
            "Attribute Triples"); otherwise a single "Triples" column is used.

    Returns:
        A pandas DataFrame indexed by ("Dataset family", "Task Name",
        "Dataset Name"), holding one row for every pair of side statistics
        and dataset name.
    """
    index_cols = ["Dataset family", "Task Name", "Dataset Name"]
    if seperate_attribute_relations:
        triple_headers = ["Relation Triples", "Attribute Triples"]
    else:
        triple_headers = ["Triples"]
    columns = (
        index_cols
        + ["Entities"]
        + triple_headers
        + [
            "Relations",
            "Properties",
            "Literals",
            "Clusters",
            "Intra-dataset Matches",
            "All Matches",
        ]
    )

    rows = []
    for ds in datasets:
        family = ds.__class__.__name__.rpartition(".")[2]
        side_stats, cluster_count = ds.statistics()
        # The total link count is read from ent_links unconditionally here.
        total_matches = ds.ent_links.number_of_links
        # Intra-dataset matches stay at zero for every side unless the links
        # are held in a PrefixedClusterHelper.
        intra_matches = (0,) * len(ds.dataset_names)
        if isinstance(ds.ent_links, PrefixedClusterHelper):
            intra_matches = ds.ent_links.number_of_intra_links

        for idx, (side, side_name) in enumerate(zip(side_stats, ds.dataset_names)):
            triple_counts = (
                [side.rel_triples, side.attr_triples]
                if seperate_attribute_relations
                else [side.triples]
            )
            # Cluster count and total matches are dataset-wide values, so they
            # are repeated on each side's row.
            row = (
                [family, ds.canonical_name, side_name, side.entities]
                + triple_counts
                + [
                    side.relations,
                    side.properties,
                    side.literals,
                    cluster_count,
                    intra_matches[idx],
                    total_matches,
                ]
            )
            rows.append(row)

    return pd.DataFrame(rows, columns=columns).set_index(index_cols)
66 changes: 63 additions & 3 deletions sylloge/create_statistic.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,68 @@
from typing import Dict, Iterable, Tuple

import pandas as pd
from eche import ClusterHelper, PrefixedClusterHelper

from sylloge import MED_BBK, OAEI, MovieGraphBenchmark, MultiSourceEADataset, OpenEA
from sylloge.base import create_statistics_df


def create_statistics_df(
    datasets: Iterable[MultiSourceEADataset], seperate_attribute_relations: bool = True
):
    """Collect per-side statistics of the given datasets in a DataFrame.

    Args:
        datasets: Datasets to summarize. Each one must provide ``statistics()``,
            ``ent_links``, ``dataset_names`` and ``canonical_name``.
        seperate_attribute_relations: If ``True``, relation and attribute
            triples are reported in two columns ("Relation Triples" and
            "Attribute Triples"); otherwise a single "Triples" column is used.

    Returns:
        A pandas DataFrame indexed by ("Dataset family", "Task Name",
        "Dataset Name"), holding one row for every pair of side statistics
        and dataset name.
    """
    index_cols = ["Dataset family", "Task Name", "Dataset Name"]
    if seperate_attribute_relations:
        triple_headers = ["Relation Triples", "Attribute Triples"]
    else:
        triple_headers = ["Triples"]
    columns = (
        index_cols
        + ["Entities"]
        + triple_headers
        + [
            "Relations",
            "Properties",
            "Literals",
            "Clusters",
            "Intra-dataset Matches",
            "All Matches",
        ]
    )

    rows = []
    for ds in datasets:
        family = ds.__class__.__name__.rpartition(".")[2]
        side_stats, cluster_count = ds.statistics()
        # Intra-dataset matches stay at zero for every side unless the links
        # are held in a PrefixedClusterHelper.
        intra_matches = (0,) * len(ds.dataset_names)
        if not isinstance(ds.ent_links, ClusterHelper):
            # Links that are not a ClusterHelper are counted by their length.
            total_matches = len(ds.ent_links)
        else:
            total_matches = ds.ent_links.number_of_links
            if isinstance(ds.ent_links, PrefixedClusterHelper):
                intra_matches = ds.ent_links.number_of_intra_links

        for idx, (side, side_name) in enumerate(zip(side_stats, ds.dataset_names)):
            triple_counts = (
                [side.rel_triples, side.attr_triples]
                if seperate_attribute_relations
                else [side.triples]
            )
            # Cluster count and total matches are dataset-wide values, so they
            # are repeated on each side's row.
            row = (
                [family, ds.canonical_name, side_name, side.entities]
                + triple_counts
                + [
                    side.relations,
                    side.properties,
                    side.literals,
                    cluster_count,
                    intra_matches[idx],
                    total_matches,
                ]
            )
            rows.append(row)

    return pd.DataFrame(rows, columns=columns).set_index(index_cols)


all_classes_with_args: Tuple[Tuple[type[MultiSourceEADataset], Dict[str, str]], ...] = (
(OpenEA, {"graph_pair": "D_W", "size": "15K", "version": "V1"}),
Expand All @@ -25,6 +84,7 @@
(MovieGraphBenchmark, {"graph_pair": "imdb-tmdb"}),
(MovieGraphBenchmark, {"graph_pair": "imdb-tvdb"}),
(MovieGraphBenchmark, {"graph_pair": "tmdb-tvdb"}),
(MovieGraphBenchmark, {"graph_pair": "multi"}),
(MED_BBK, {}),
(OAEI, {"task": "marvelcinematicuniverse-marvel"}),
(OAEI, {"task": "memoryalpha-memorybeta"}),
Expand All @@ -34,7 +94,7 @@
)


def create_statistic(
def create_and_write_statistic(
classes_with_args: Iterable[
Tuple[type[MultiSourceEADataset], Dict[str, str]]
] = all_classes_with_args,
Expand All @@ -49,4 +109,4 @@ def create_statistic(


if __name__ == "__main__":
create_statistic()
create_and_write_statistic()

0 comments on commit ba2a1cd

Please sign in to comment.