Skip to content

Commit

Permalink
update llm testing
Browse files: browse the repository at this point in the history
  • Loading branch information
wj-Mcat committed Sep 20, 2023
1 parent 7311f09 commit 5e2c23a
Show file tree
Hide file tree
Showing 5 changed files with 8 additions and 90 deletions.
30 changes: 1 addition & 29 deletions tests/llm/test_finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,11 @@
# limitations under the License.
from __future__ import annotations

import os
import shutil
import sys
import tempfile
import unittest

from parameterized import parameterized_class

from paddlenlp.utils.downloader import get_path_from_url
from tests.testing_utils import argv_context_guard, load_test_config

from .testing_utils import LLMTest
Expand All @@ -43,36 +39,12 @@ class FinetuneTest(LLMTest, unittest.TestCase):
def setUp(self) -> None:
LLMTest.setUp(self)

self.data_dir = tempfile.mkdtemp()
sys.path.insert(0, self.model_dir)

# Run pretrain
URL = "https://bj.bcebos.com/paddlenlp/datasets/examples/AdvertiseGen.tar.gz"
get_path_from_url(URL, root_dir=self.data_dir)
self.data_dir = os.path.join(self.data_dir, "data")
self.use_small_datasets()

def use_small_datasets(self):
# use 20 examples
def use_few_examples(file):
with open(os.path.join(self.data_dir, file), "r", encoding="utf8") as f:
lines = [line.strip() for line in f.readlines()]
with open(os.path.join(self.data_dir, file), "w+", encoding="utf8") as f:
f.write("\n".join(lines[:20]))

shutil.copyfile(
os.path.join(self.data_dir, "dev.json"),
os.path.join(self.data_dir, "validation.json"),
)
use_few_examples("train.json")
use_few_examples("dev.json")
use_few_examples("validation.json")

def tearDown(self) -> None:
LLMTest.tearDown(self)
shutil.rmtree(self.data_dir)

def test_pretrain(self):
def test_finetune(self):
finetune_config = load_test_config(self.config_path, "finetune", self.model_dir)

finetune_config["dataset_name_or_path"] = self.data_dir
Expand Down
27 changes: 0 additions & 27 deletions tests/llm/test_lora.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,12 @@
from __future__ import annotations

import os
import shutil
import sys
import tempfile
import unittest

import paddle
from parameterized import parameterized_class

from paddlenlp.utils.downloader import get_path_from_url
from tests.testing_utils import argv_context_guard, load_test_config

from .testing_utils import LLMTest
Expand All @@ -45,35 +42,11 @@ class LoraTest(LLMTest, unittest.TestCase):
def setUp(self) -> None:
LLMTest.setUp(self)

self.data_dir = tempfile.mkdtemp()
self.model_codes_dir = os.path.join(self.root_path, self.model_dir)
sys.path.insert(0, self.model_codes_dir)

# Run pretrain
URL = "https://bj.bcebos.com/paddlenlp/datasets/examples/AdvertiseGen.tar.gz"
get_path_from_url(URL, root_dir=self.data_dir)
self.data_dir = os.path.join(self.data_dir, "data")
self.use_small_datasets()

def use_small_datasets(self):
# use 20 examples
def use_few_examples(file):
with open(os.path.join(self.data_dir, file), "r", encoding="utf8") as f:
lines = [line.strip() for line in f.readlines()]
with open(os.path.join(self.data_dir, file), "w+", encoding="utf8") as f:
f.write("\n".join(lines[:20]))

shutil.copyfile(
os.path.join(self.data_dir, "dev.json"),
os.path.join(self.data_dir, "validation.json"),
)
use_few_examples("train.json")
use_few_examples("dev.json")
use_few_examples("validation.json")

def tearDown(self) -> None:
LLMTest.tearDown(self)
shutil.rmtree(self.data_dir)
sys.path.remove(self.model_codes_dir)

def test_lora(self):
Expand Down
30 changes: 1 addition & 29 deletions tests/llm/test_prefix_tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,11 @@
from __future__ import annotations

import os
import shutil
import sys
import tempfile
import unittest

from parameterized import parameterized_class

from paddlenlp.utils.downloader import get_path_from_url
from tests.testing_utils import argv_context_guard, load_test_config

from .testing_utils import LLMTest
Expand All @@ -40,39 +37,14 @@ class LoraTest(LLMTest, unittest.TestCase):
def setUp(self) -> None:
LLMTest.setUp(self)

self.data_dir = tempfile.mkdtemp()
self.model_codes_dir = os.path.join(self.root_path, self.model_dir)
sys.path.insert(0, self.model_codes_dir)

# Run pretrain
URL = "https://bj.bcebos.com/paddlenlp/datasets/examples/AdvertiseGen.tar.gz"
get_path_from_url(URL, root_dir=self.data_dir)
self.data_dir = os.path.join(self.data_dir, "data")
self.use_small_datasets()

def use_small_datasets(self):
# use 20 examples
def use_few_examples(file):
with open(os.path.join(self.data_dir, file), "r", encoding="utf8") as f:
lines = [line.strip() for line in f.readlines()]
with open(os.path.join(self.data_dir, file), "w+", encoding="utf8") as f:
f.write("\n".join(lines[:20]))

shutil.copyfile(
os.path.join(self.data_dir, "dev.json"),
os.path.join(self.data_dir, "validation.json"),
)
use_few_examples("train.json")
use_few_examples("dev.json")
use_few_examples("validation.json")

def tearDown(self) -> None:
LLMTest.tearDown(self)
shutil.rmtree(self.data_dir)

sys.path.remove(self.model_codes_dir)

def test_pretrain(self):
def test_prefix_tuning(self):
prefix_tuning_config = load_test_config(self.config_path, "prefix_tuning", self.model_dir)

prefix_tuning_config["dataset_name_or_path"] = self.data_dir
Expand Down
10 changes: 5 additions & 5 deletions tests/llm/test_pretrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,26 +40,26 @@ class PretrainTest(LLMTest, unittest.TestCase):
def setUp(self) -> None:
LLMTest.setUp(self)

self.data_dir = tempfile.mkdtemp()
self.dataset_dir = tempfile.mkdtemp()
self.model_codes_dir = os.path.join(self.root_path, self.model_dir)
sys.path.insert(0, self.model_codes_dir)

def tearDown(self) -> None:
LLMTest.tearDown(self)

sys.path.remove(self.model_codes_dir)
shutil.rmtree(self.data_dir)
shutil.rmtree(self.dataset_dir)

def test_pretrain(self):
# Run pretrain
URL = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy"
URL2 = "https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz"
get_path_from_url(URL, root_dir=self.data_dir)
get_path_from_url(URL2, root_dir=self.data_dir)
get_path_from_url(URL, root_dir=self.dataset_dir)
get_path_from_url(URL2, root_dir=self.dataset_dir)

pretrain_config = load_test_config(self.config_path, "pretrain", self.model_dir)

pretrain_config["input_dir"] = self.data_dir
pretrain_config["input_dir"] = self.dataset_dir
pretrain_config["output_dir"] = self.output_dir

with argv_context_guard(pretrain_config):
Expand Down
1 change: 1 addition & 0 deletions tests/llm/testing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
class LLMTest:
config_path: str = None
enable_compare: bool = True
data_dir = "./tests/fixtures/llm/data/"

def setUp(self) -> None:
self.root_path = "./llm"
Expand Down

0 comments on commit 5e2c23a

Please sign in to comment.