Update LLM tests: use shared fixture data instead of downloading datasets in setUp

PaddlePaddle · Sep 20, 2023 · 5e2c23a
1 parent 7311f09
commit 5e2c23a
Show file tree

Hide file tree

Showing 5 changed files with 8 additions and 90 deletions.
diff --git a/tests/llm/test_finetune.py b/tests/llm/test_finetune.py
@@ -13,15 +13,11 @@
 # limitations under the License.
 from __future__ import annotations
 
-import os
-import shutil
 import sys
-import tempfile
 import unittest
 
 from parameterized import parameterized_class
 
-from paddlenlp.utils.downloader import get_path_from_url
 from tests.testing_utils import argv_context_guard, load_test_config
 
 from .testing_utils import LLMTest
@@ -43,36 +39,12 @@ class FinetuneTest(LLMTest, unittest.TestCase):
     def setUp(self) -> None:
         LLMTest.setUp(self)
 
-        self.data_dir = tempfile.mkdtemp()
         sys.path.insert(0, self.model_dir)
 
-        # Run pretrain
-        URL = "https://bj.bcebos.com/paddlenlp/datasets/examples/AdvertiseGen.tar.gz"
-        get_path_from_url(URL, root_dir=self.data_dir)
-        self.data_dir = os.path.join(self.data_dir, "data")
-        self.use_small_datasets()
-
-    def use_small_datasets(self):
-        # use 20 examples
-        def use_few_examples(file):
-            with open(os.path.join(self.data_dir, file), "r", encoding="utf8") as f:
-                lines = [line.strip() for line in f.readlines()]
-            with open(os.path.join(self.data_dir, file), "w+", encoding="utf8") as f:
-                f.write("\n".join(lines[:20]))
-
-        shutil.copyfile(
-            os.path.join(self.data_dir, "dev.json"),
-            os.path.join(self.data_dir, "validation.json"),
-        )
-        use_few_examples("train.json")
-        use_few_examples("dev.json")
-        use_few_examples("validation.json")
-
     def tearDown(self) -> None:
         LLMTest.tearDown(self)
-        shutil.rmtree(self.data_dir)
 
-    def test_pretrain(self):
+    def test_finetune(self):
         finetune_config = load_test_config(self.config_path, "finetune", self.model_dir)
 
         finetune_config["dataset_name_or_path"] = self.data_dir

diff --git a/tests/llm/test_lora.py b/tests/llm/test_lora.py
@@ -14,15 +14,12 @@
 from __future__ import annotations
 
 import os
-import shutil
 import sys
-import tempfile
 import unittest
 
 import paddle
 from parameterized import parameterized_class
 
-from paddlenlp.utils.downloader import get_path_from_url
 from tests.testing_utils import argv_context_guard, load_test_config
 
 from .testing_utils import LLMTest
@@ -45,35 +42,11 @@ class LoraTest(LLMTest, unittest.TestCase):
     def setUp(self) -> None:
         LLMTest.setUp(self)
 
-        self.data_dir = tempfile.mkdtemp()
         self.model_codes_dir = os.path.join(self.root_path, self.model_dir)
         sys.path.insert(0, self.model_codes_dir)
 
-        # Run pretrain
-        URL = "https://bj.bcebos.com/paddlenlp/datasets/examples/AdvertiseGen.tar.gz"
-        get_path_from_url(URL, root_dir=self.data_dir)
-        self.data_dir = os.path.join(self.data_dir, "data")
-        self.use_small_datasets()
-
-    def use_small_datasets(self):
-        # use 20 examples
-        def use_few_examples(file):
-            with open(os.path.join(self.data_dir, file), "r", encoding="utf8") as f:
-                lines = [line.strip() for line in f.readlines()]
-            with open(os.path.join(self.data_dir, file), "w+", encoding="utf8") as f:
-                f.write("\n".join(lines[:20]))
-
-        shutil.copyfile(
-            os.path.join(self.data_dir, "dev.json"),
-            os.path.join(self.data_dir, "validation.json"),
-        )
-        use_few_examples("train.json")
-        use_few_examples("dev.json")
-        use_few_examples("validation.json")
-
     def tearDown(self) -> None:
         LLMTest.tearDown(self)
-        shutil.rmtree(self.data_dir)
         sys.path.remove(self.model_codes_dir)
 
     def test_lora(self):

diff --git a/tests/llm/test_prefix_tuning.py b/tests/llm/test_prefix_tuning.py
@@ -14,14 +14,11 @@
 from __future__ import annotations
 
 import os
-import shutil
 import sys
-import tempfile
 import unittest
 
 from parameterized import parameterized_class
 
-from paddlenlp.utils.downloader import get_path_from_url
 from tests.testing_utils import argv_context_guard, load_test_config
 
 from .testing_utils import LLMTest
@@ -40,39 +37,14 @@ class LoraTest(LLMTest, unittest.TestCase):
     def setUp(self) -> None:
         LLMTest.setUp(self)
 
-        self.data_dir = tempfile.mkdtemp()
         self.model_codes_dir = os.path.join(self.root_path, self.model_dir)
         sys.path.insert(0, self.model_codes_dir)
 
-        # Run pretrain
-        URL = "https://bj.bcebos.com/paddlenlp/datasets/examples/AdvertiseGen.tar.gz"
-        get_path_from_url(URL, root_dir=self.data_dir)
-        self.data_dir = os.path.join(self.data_dir, "data")
-        self.use_small_datasets()
-
-    def use_small_datasets(self):
-        # use 20 examples
-        def use_few_examples(file):
-            with open(os.path.join(self.data_dir, file), "r", encoding="utf8") as f:
-                lines = [line.strip() for line in f.readlines()]
-            with open(os.path.join(self.data_dir, file), "w+", encoding="utf8") as f:
-                f.write("\n".join(lines[:20]))
-
-        shutil.copyfile(
-            os.path.join(self.data_dir, "dev.json"),
-            os.path.join(self.data_dir, "validation.json"),
-        )
-        use_few_examples("train.json")
-        use_few_examples("dev.json")
-        use_few_examples("validation.json")
-
     def tearDown(self) -> None:
         LLMTest.tearDown(self)
-        shutil.rmtree(self.data_dir)
-
         sys.path.remove(self.model_codes_dir)
 
-    def test_pretrain(self):
+    def test_prefix_tuning(self):
         prefix_tuning_config = load_test_config(self.config_path, "prefix_tuning", self.model_dir)
 
         prefix_tuning_config["dataset_name_or_path"] = self.data_dir

diff --git a/tests/llm/test_pretrain.py b/tests/llm/test_pretrain.py
@@ -40,26 +40,26 @@ class PretrainTest(LLMTest, unittest.TestCase):
     def setUp(self) -> None:
         LLMTest.setUp(self)
 
-        self.data_dir = tempfile.mkdtemp()
+        self.dataset_dir = tempfile.mkdtemp()
         self.model_codes_dir = os.path.join(self.root_path, self.model_dir)
         sys.path.insert(0, self.model_codes_dir)
 
     def tearDown(self) -> None:
         LLMTest.tearDown(self)
 
         sys.path.remove(self.model_codes_dir)
-        shutil.rmtree(self.data_dir)
+        shutil.rmtree(self.dataset_dir)
 
     def test_pretrain(self):
         # Run pretrain
         URL = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy"
         URL2 = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz"
-        get_path_from_url(URL, root_dir=self.data_dir)
-        get_path_from_url(URL2, root_dir=self.data_dir)
+        get_path_from_url(URL, root_dir=self.dataset_dir)
+        get_path_from_url(URL2, root_dir=self.dataset_dir)
 
         pretrain_config = load_test_config(self.config_path, "pretrain", self.model_dir)
 
-        pretrain_config["input_dir"] = self.data_dir
+        pretrain_config["input_dir"] = self.dataset_dir
         pretrain_config["output_dir"] = self.output_dir
 
         with argv_context_guard(pretrain_config):

diff --git a/tests/llm/testing_utils.py b/tests/llm/testing_utils.py
@@ -27,6 +27,7 @@
 class LLMTest:
     config_path: str = None
     enable_compare: bool = True
+    data_dir = "./tests/fixtures/llm/data/"
 
     def setUp(self) -> None:
         self.root_path = "./llm"