Add synthetic tasks from GPT-3 paper.
PiperOrigin-RevId: 329329055
TensorFlow Datasets Team authored and copybara-github committed Aug 31, 2020
1 parent 6c36f15 commit 88a8333
Showing 7 changed files with 162 additions and 0 deletions.
Binary file not shown.
@@ -0,0 +1,2 @@
{"context": "\n\nQ: What is the sum of the digits of the number 8606?\n\nA:", "completion": " 20"}
{"context": "\n\nQ: What is the sum of the digits of the number 3085?\n\nA:", "completion": " 16"}
@@ -0,0 +1,2 @@
{"context": "\n\nQ: What is 98 plus 45?\n\nA:", "completion": " 143"}
{"context": "\n\nQ: What is 95 plus 58?\n\nA:", "completion": " 153"}
1 change: 1 addition & 0 deletions tensorflow_datasets/text/__init__.py
@@ -31,6 +31,7 @@
from tensorflow_datasets.text.gap import Gap
from tensorflow_datasets.text.glue import Glue
from tensorflow_datasets.text.goemotions import Goemotions
from tensorflow_datasets.text.gpt3 import Gpt3
from tensorflow_datasets.text.imdb import IMDBReviews
from tensorflow_datasets.text.imdb import IMDBReviewsConfig
from tensorflow_datasets.text.irc_disentanglement import IrcDisentanglement
123 changes: 123 additions & 0 deletions tensorflow_datasets/text/gpt3.py
@@ -0,0 +1,123 @@
# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""GPT-3 few-shot evaluation dataset."""

import gzip
import json
import os
import uuid
import tensorflow.compat.v2 as tf
import tensorflow_datasets.public_api as tfds

_CITATION = """
@article{brown2020language,
title={Language Models are Few-Shot Learners},
author={Tom B. Brown et. al.}
year={2020},
eprint={2005.14165},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""

_DESCRIPTION = """
Synthetic datasets for the word-scramble and arithmetic tasks described in the GPT-3 paper.
"""

_DATA_URL = "https://github.com/openai/gpt-3/archive/master.zip"

_MODULES = [
    "cycle_letters_in_word",
    "five_digit_addition",
    "five_digit_subtraction",
    "four_digit_addition",
    "four_digit_subtraction",
    "mid_word_1_anagrams",
    "mid_word_2_anagrams",
    "random_insertion_in_word",
    "reversed_words",
    "single_digit_three_ops",
    "six_digit_addition",
    "six_digit_subtraction",
    "sum_of_digits",
    "three_digit_addition",
    "three_digit_subtraction",
    "two_digit_addition",
    "two_digit_multiplication",
    "two_digit_subtraction",
]


def _is_gzip_file(task):
  # The word-scramble tasks (all of which have "word" in their name) ship as
  # gzip-compressed JSON Lines in the upstream repository; the arithmetic
  # tasks are plain .jsonl files.
  return "word" in task


def _make_builder_config(module):
  return tfds.core.BuilderConfig(
      name=module,
      version=tfds.core.Version("1.0.0"),
      description=_DESCRIPTION,
  )


class Gpt3(tfds.core.GeneratorBasedBuilder):
  """GPT-3 Dataset."""

  BUILDER_CONFIGS = [_make_builder_config(module) for module in _MODULES]

  def _info(self):
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            "context": tfds.features.Text(),
            "completion": tfds.features.Text(),
        }),
        homepage="https://github.com/openai/gpt-3",
        supervised_keys=("context", "completion"),
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Returns SplitGenerator."""

    data = tfds.download.Resource(
        url=_DATA_URL, extract_method=tfds.download.ExtractMethod.ZIP)
    directory = dl_manager.download_and_extract(data)

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            gen_kwargs={
                "directory": directory,
                "task": self.builder_config.name,
            }),
    ]

  def _generate_examples(self, directory, task):
    """Yields examples based on directory, task name."""

    path = os.path.join(directory, "gpt-3-master", "data", task + ".jsonl")

    if _is_gzip_file(task):
      path += ".gz"

    with tf.io.gfile.GFile(path, "rb") as f:
      if _is_gzip_file(task):
        f = gzip.GzipFile(fileobj=f)

      for line in f:
        yield uuid.uuid4().hex, json.loads(line)
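
Once registered, each task above should be exposed as a separate config of a "gpt3" dataset. A minimal usage sketch, not part of the commit, assuming the "gpt3/two_digit_addition" name follows from the builder class and config names defined above (the first call downloads and extracts the upstream archive):

import tensorflow_datasets as tfds

# Only a "test" split is defined by _split_generators above.
ds = tfds.load("gpt3/two_digit_addition", split="test")
for example in ds.take(2):
  context = example["context"].numpy().decode("utf-8")
  completion = example["completion"].numpy().decode("utf-8")
  print(repr(context), "->", repr(completion))
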
33 changes: 33 additions & 0 deletions tensorflow_datasets/text/gpt3_test.py
@@ -0,0 +1,33 @@
# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for GPT3 dataset module."""

from tensorflow_datasets import testing
from tensorflow_datasets.text import gpt3


class Gpt3Test(testing.DatasetBuilderTestCase):
  DATASET_CLASS = gpt3.Gpt3
  BUILDER_CONFIG_NAMES_TO_TEST = [
      "cycle_letters_in_word", "sum_of_digits", "two_digit_addition"
  ]
  SPLITS = {
      "test": 2,  # Number of fake test examples.
  }


if __name__ == "__main__":
  testing.test_main()
1 change: 1 addition & 0 deletions tensorflow_datasets/url_checksums/gpt3.txt
@@ -0,0 +1 @@
https://github.com/openai/gpt-3/archive/master.zip 2259409 a200cd633fb51190c61c6ad128c31e7ef7d03a00ab03baf6feff300b0fbdedab
