forked from huggingface/datatrove
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pyproject.toml
123 lines (113 loc) · 2.62 KB
/
pyproject.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
[project]
name = "datatrove"
# Version must look like x.y.z, x.y.z.dev0 or x.y.z.rc1: dot-separated, never dashes.
version = "0.0.1.dev0"
description = "HuggingFace library to process and filter large amounts of webdata"
readme = "README.md"
authors = [
    { name = "HuggingFace Inc.", email = "guilherme@huggingface.co" },
]
license = { text = "Apache-2.0" }
classifiers = [
    "Intended Audience :: Developers",
    "Intended Audience :: Education",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
keywords = ["data", "machine", "learning", "processing"]
# Lower bound only; the classifiers above advertise 3.10 through 3.12.
requires-python = ">=3.10.0"
# Core requirements, installed with every variant of the package.
# Feature-specific packages are declared as extras instead.
dependencies = [
    "dill>=0.3.0",
    "fsspec>=2023.6.0",
    "huggingface-hub>=0.17.0",
    "humanize",
    "loguru>=0.7.0",
    "multiprocess",
    "numpy>=1.25.0",
    "tqdm",
]
# Extras, installable as `datatrove[<name>]`.
[project.optional-dependencies]
cli = [
    "rich",
]
io = [
    "faust-cchardet",
    "pyarrow",
    "python-magic",
    "warcio",
    "datasets",
]
s3 = [
    "s3fs>=2023.12.2",
]
processing = [
    "fasttext-wheel",
    "nltk",
    "inscriptis",
    # Direct Git reference: installs from the `speedup` ref of
    # huggingface/python-readability rather than from a package index.
    "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup",
    "tldextract",
    "trafilatura",
    "tokenizers",
]
quality = [
    "ruff>=0.1.5",
]
# Pulls in the cli, io, processing and s3 extras by self-reference,
# plus the test tooling listed below them.
testing = [
    "datatrove[cli]",
    "datatrove[io]",
    "datatrove[processing]",
    "datatrove[s3]",
    "pytest",
    "pytest-timeout",
    "pytest-xdist",
    "moto[s3,server]",
]
# Union of the quality and testing extras.
all = [
    "datatrove[quality]",
    "datatrove[testing]",
]
# Same contents as `all`, under a second name.
dev = [
    "datatrove[all]",
]
[project.urls]
Repository = "https://github.com/huggingface/datatrove"
# Console commands, each mapped to a `module:function` entry point
# under `datatrove.tools`. Listed alphabetically.
[project.scripts]
check_dataset = "datatrove.tools.check_dataset:main"
failed_logs = "datatrove.tools.failed_logs:main"
inspect_data = "datatrove.tools.inspect_data:main"
launch_pickled_pipeline = "datatrove.tools.launch_pickled_pipeline:main"
merge_stats = "datatrove.tools.merge_stats:main"
# Build backend: setuptools, with no version constraint.
[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools"]
# Package discovery is rooted at the `src` directory.
[tool.setuptools.packages.find]
where = ["src"]
# Non-Python files shipped inside the `datatrove` package.
[tool.setuptools.package-data]
datatrove = ["assets/*"]
# Ruff configuration.
# Only `line-length` stays at the top level. Linter options (rule selection,
# per-file ignores, isort) live under `[tool.ruff.lint]`, because the
# top-level spellings of those options are deprecated by Ruff.
[tool.ruff]
line-length = 119

[tool.ruff.lint]
ignore = [
    "C901", # `function_name` is too complex
    "E501", # line length violation
]
select = [
    "C",
    "E",
    "F",
    "I",
    "W",
]

[tool.ruff.lint.per-file-ignores]
"__init__.py" = [
    "F401", # module imported but unused
]

[tool.ruff.lint.isort]
lines-after-imports = 2
known-first-party = [
    "datatrove",
]