-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
RISCBAC.py
104 lines (88 loc) · 3.55 KB
/
RISCBAC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RISCBAC: a synthetic bilingual automotive insurance contract dataset"""
import json
import os
import datasets
# BibTeX entry (arXiv 2304.04212) passed as `citation` to `datasets.DatasetInfo` in `RISCBAC._info`.
_CITATION = """\
@misc{beaucheminrisc,
title={{RISC: Generating Realistic Synthetic Bilingual Insurance
Contract}},
author={David Beauchemin and Richard Khoury},
year={2023},
eprint={2304.04212},
archivePrefix={arXiv}
}
"""
# Human-readable summary of the dataset, passed as `description` to `datasets.DatasetInfo`.
_DESCRIPTION = """\
RISCBAC was created using [RISC](https://github.com/GRAAL-Research/risc), an open-source Python package data
generator. RISC generates look-alike automobile insurance contracts based on the Quebec regulatory insurance
form in French and English.
It contains 10,000 English and French insurance contracts generated using the same seed. Thus, contracts share
the same deterministic synthetic data (RISCBAC can be used as an aligned dataset). RISC can be used to generate
more data for RISCBAC.
"""
# Dataset landing page, passed as `homepage` to `datasets.DatasetInfo`.
_HOMEPAGE = "https://huggingface.co/datasets/davebulaval/RISCBAC"
# License string, passed as `license` to `datasets.DatasetInfo`.
_LICENSE = "Attribution 4.0 International (CC BY 4.0)"
# Archive downloaded and extracted by `RISCBAC._split_generators`, which then reads
# `en.jsonl` or `fr.jsonl` from the extracted directory.
_URL = "https://graal.ift.ulaval.ca/public/deepparse/riscbac.zip"
class RISCBAC(datasets.GeneratorBasedBuilder):
    """RISCBAC: a synthetic bilingual automotive insurance contract dataset.

    Two configurations are exposed, ``"en"`` and ``"fr"`` (the default). Each one
    produces a single split, ``full_en`` or ``full_fr``, read from the matching
    JSON Lines file (``en.jsonl`` / ``fr.jsonl``) of the downloaded archive.
    """

    VERSION = datasets.Version("1.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="en", version=VERSION, description="This part of the dataset are automobile contract in English."
        ),
        datasets.BuilderConfig(
            name="fr", version=VERSION, description="This part of the dataset are automobile contract in French."
        ),
    ]

    DEFAULT_CONFIG_NAME = "fr"

    def _info(self):
        """Return the dataset metadata.

        The declared schema has a single string feature named ``text``.
        """
        features = datasets.Features(
            {
                "text": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and return the split generator for the active config.

        Args:
            dl_manager: Download manager whose ``download_and_extract`` fetches
                ``_URL`` and returns the directory holding the extracted files.

        Returns:
            A one-element list with the ``full_<lang>`` split, whose generator
            receives the path to ``<lang>.jsonl``.

        Raises:
            ValueError: If the config name is neither ``"en"`` nor ``"fr"``.
        """
        language = self.config.name
        # Validate before downloading so an unsupported config fails fast
        # instead of fetching and extracting the whole archive first.
        if language not in ("en", "fr"):
            raise ValueError(f"The config name {self.config.name} is not supported. Please use " "'en' or 'fr'.")

        data_dir = dl_manager.download_and_extract(_URL)
        return [
            datasets.SplitGenerator(
                name=f"full_{language}",
                gen_kwargs={"filepath": os.path.join(data_dir, f"{language}.jsonl")},
            )
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs from a JSON Lines file.

        Args:
            filepath: Path to a UTF-8 file holding one JSON document per line.

        Yields:
            Tuples of the zero-based line index and the decoded JSON value.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            for key, line in enumerate(f):
                # Empty or whitespace-only lines are not JSON documents; skip them
                # rather than letting json.loads raise JSONDecodeError. The key
                # remains the physical line index, so keys stay unique.
                if not line.strip():
                    continue
                yield key, json.loads(line)