Add synthetic tasks from GPT-3 paper.
PiperOrigin-RevId: 329329055
TensorFlow Datasets Team authored and copybara-github committed Aug 31, 2020
1 parent 6c36f15 commit 88a8333
Showing 7 changed files with 162 additions and 0 deletions.
Binary file not shown.
@@ -0,0 +1,2 @@
{"context": "\n\nQ: What is the sum of the digits of the number 8606?\n\nA:", "completion": " 20"}
{"context": "\n\nQ: What is the sum of the digits of the number 3085?\n\nA:", "completion": " 16"}
@@ -0,0 +1,2 @@
{"context": "\n\nQ: What is 98 plus 45?\n\nA:", "completion": " 143"}
{"context": "\n\nQ: What is 95 plus 58?\n\nA:", "completion": " 153"}
1 change: 1 addition & 0 deletions tensorflow_datasets/text/__init__.py
@@ -31,6 +31,7 @@
from tensorflow_datasets.text.gap import Gap
from tensorflow_datasets.text.glue import Glue
from tensorflow_datasets.text.goemotions import Goemotions
from tensorflow_datasets.text.gpt3 import Gpt3
from tensorflow_datasets.text.imdb import IMDBReviews
from tensorflow_datasets.text.imdb import IMDBReviewsConfig
from tensorflow_datasets.text.irc_disentanglement import IrcDisentanglement
123 changes: 123 additions & 0 deletions tensorflow_datasets/text/gpt3.py
@@ -0,0 +1,123 @@
# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""GPT-3 few-shot evaluation dataset."""

import gzip
import json
import os
import uuid
import tensorflow.compat.v2 as tf
import tensorflow_datasets.public_api as tfds

_CITATION = """
@article{brown2020language,
title={Language Models are Few-Shot Learners},
author={Tom B. Brown et. al.}
year={2020},
eprint={2005.14165},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""

_DESCRIPTION = """
Synthetic datasets for the word-scramble and arithmetic tasks described in the GPT-3 paper.
"""

_DATA_URL = "https://github.com/openai/gpt-3/archive/master.zip"

_MODULES = [
    "cycle_letters_in_word",
    "five_digit_addition",
    "five_digit_subtraction",
    "four_digit_addition",
    "four_digit_subtraction",
    "mid_word_1_anagrams",
    "mid_word_2_anagrams",
    "random_insertion_in_word",
    "reversed_words",
    "single_digit_three_ops",
    "six_digit_addition",
    "six_digit_subtraction",
    "sum_of_digits",
    "three_digit_addition",
    "three_digit_subtraction",
    "two_digit_addition",
    "two_digit_multiplication",
    "two_digit_subtraction",
]


def _is_gzip_file(task):
  # The word-scramble tasks (all of which have "word" in their name) ship as
  # gzip-compressed JSON Lines in the upstream repository; the arithmetic
  # tasks are plain .jsonl files.
  return "word" in task


def _make_builder_config(module):
  return tfds.core.BuilderConfig(
      name=module,
      version=tfds.core.Version("1.0.0"),
      description=_DESCRIPTION,
  )


class Gpt3(tfds.core.GeneratorBasedBuilder):
  """GPT-3 Dataset."""

  BUILDER_CONFIGS = [_make_builder_config(module) for module in _MODULES]

  def _info(self):
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            "context": tfds.features.Text(),
            "completion": tfds.features.Text(),
        }),
        homepage="https://github.com/openai/gpt-3",
        supervised_keys=("context", "completion"),
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Returns SplitGenerator."""

    data = tfds.download.Resource(
        url=_DATA_URL, extract_method=tfds.download.ExtractMethod.ZIP)
    directory = dl_manager.download_and_extract(data)

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            gen_kwargs={
                "directory": directory,
                "task": self.builder_config.name,
            }),
    ]

  def _generate_examples(self, directory, task):
    """Yields examples based on directory, task name."""

    path = os.path.join(directory, "gpt-3-master", "data", task + ".jsonl")

    if _is_gzip_file(task):
      path += ".gz"

    with tf.io.gfile.GFile(path, "rb") as f:
      if _is_gzip_file(task):
        f = gzip.GzipFile(fileobj=f)

      for line in f:
        yield uuid.uuid4().hex, json.loads(line)
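
Once registered, each task above should be exposed as a separate config of a "gpt3" dataset. A minimal usage sketch, not part of the commit, assuming the "gpt3/two_digit_addition" name follows from the builder class and config names defined above (the first call downloads and extracts the upstream archive):

import tensorflow_datasets as tfds

# Only a "test" split is defined by _split_generators above.
ds = tfds.load("gpt3/two_digit_addition", split="test")
for example in ds.take(2):
  context = example["context"].numpy().decode("utf-8")
  completion = example["completion"].numpy().decode("utf-8")
  print(repr(context), "->", repr(completion))
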
33 changes: 33 additions & 0 deletions tensorflow_datasets/text/gpt3_test.py
@@ -0,0 +1,33 @@
# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for GPT3 dataset module."""

from tensorflow_datasets import testing
from tensorflow_datasets.text import gpt3


class Gpt3Test(testing.DatasetBuilderTestCase):
  DATASET_CLASS = gpt3.Gpt3
  BUILDER_CONFIG_NAMES_TO_TEST = [
      "cycle_letters_in_word", "sum_of_digits", "two_digit_addition"
  ]
  SPLITS = {
      "test": 2,  # Number of fake test examples.
  }


if __name__ == "__main__":
  testing.test_main()
1 change: 1 addition & 0 deletions tensorflow_datasets/url_checksums/gpt3.txt
@@ -0,0 +1 @@
https://github.com/openai/gpt-3/archive/master.zip 2259409 a200cd633fb51190c61c6ad128c31e7ef7d03a00ab03baf6feff300b0fbdedab
