-
Notifications
You must be signed in to change notification settings - Fork 3
/
language_identification.schema.json
190 lines (190 loc) · 6.15 KB
/
language_identification.schema.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
{
"$schema": "http://json-schema.org/draft-06/schema#",
"$id": "https://impresso.github.io/impresso-schemas/json/language_identification/language_identification.schema.json",
"title": "Impresso Language Identification Information",
"description": "A representation for information relevant to impresso's content item language recognition. This defines the output of the following impresso NLP component: https://github.com/impresso/impresso-language-identification",
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "The unique identifier for a content item, cf. https://github.com/impresso/impresso-schemas/blob/master/json/newspaper/contentitem.schema.json"
},
"orig_lg": {
"type": [
"string",
"null"
],
"oneOf": [
{
"title": "null",
"description": "No information on language from original metadata is available.",
"type": "null"
},
{
"$ref": "#/definitions/langISO639Type"
}
],
"description": "Original language of the content item as provided by the metadata, cf. https://github.com/impresso/impresso-schemas/blob/master/json/newspaper/contentitem.schema.json"
},
"lg": {
"type": [
"string",
"null"
],
"oneOf": [
{
"title": "null",
"type": "null"
},
{
"$ref": "#/definitions/langISO639Type"
}
],
"description": "Computed language of the content item, cf. https://github.com/impresso/impresso-schemas/blob/master/json/newspaper/contentitem.schema.json"
},
"lg_decision": {
"enum": [
"all",
"all-but-impresso_ft",
"voting",
"dominant-by-len",
"dominant-by-lowvote"
],
"type": "string",
"description": "An identifier for the decision strategy applied to the content item: 'all' = all LID systems/info agree; 'all-but-impresso_ft' = all LID systems except impresso_ft agree on a language other than de/fr; 'dominant-by-len' = the most frequent language of the ensemble decisions is selected because there are too few characters; 'dominant-by-lowvote' = the most frequent language of the ensemble decisions is selected because there are too few votes; 'voting' = the language with the highest vote count is selected"
},
"tp": {
"type": "string",
"description": "Type of the content item, cf. https://github.com/impresso/impresso-schemas/blob/master/json/newspaper/contentitem.schema.json"
},
"len": {
"type": "integer",
"description": "Number of characters of content item",
"minimum": 0
},
"impresso_language_identifier_version": {
  "description": "Version information of used impresso language identifier script and the file creation timestamp",
  "allOf": [
    {
      "$ref": "#/definitions/versionType"
    }
  ]
},
"language_identifier_version": {
  "description": "Version information of used impresso language identifier script and the timestamp of the file used for the decision",
  "allOf": [
    {
      "$ref": "#/definitions/versionType"
    }
  ]
},
"alphabetical_ratio": {
"type": [
"number",
"null"
],
"description": "Ratio of alphabetical characters (Unicode letter class matched by \\w) w.r.t. text length of content item",
"minimum": 0,
"maximum": 1
},
"langdetect": {
"type": "array",
"items": {
"$ref": "#/definitions/lidType"
},
"description": "List of predicted languages using Langdetect, sorted by probability"
},
"langid": {
"type": "array",
"items": {
"$ref": "#/definitions/lidType"
},
"description": "List of predicted languages using Langid, sorted by probability"
},
"impresso_ft": {
"type": "array",
"items": {
"$ref": "#/definitions/lidType"
},
"description": "List of predicted languages using a FastText model trained on Impresso articles, sorted by probability"
},
"wp_ft": {
"type": "array",
"items": {
"$ref": "#/definitions/lidType"
},
"description": "List of predicted languages using a FastText model trained on Wikipedia, sorted by probability"
},
"votes": {
  "type": "array",
  "description": "List of languages together with the vote each one received (cf. lg_decision)",
  "items": {
    "type": "object",
    "properties": {
      "lang": {
        "$ref": "#/definitions/langISO639Type"
      },
      "vote": {
        "type": "number"
      }
    },
    "required": [
      "lang",
      "vote"
    ]
  }
}
},
"required": [
"tp",
"id",
"len",
"lg",
"orig_lg"
],
"definitions": {
"lidType": {
"title": "lidType",
"description": "Predicted language and its probability",
"type": "object",
"properties": {
"lang": {
"$ref": "#/definitions/langISO639Type"
},
"prob": {
"$ref": "#/definitions/probType"
}
},
"required": [
"lang",
"prob"
]
},
"langISO639Type": {
"type": "string",
"title": "langISO639Type",
"description": "Two- or three-letter ISO language abbreviation, cf. https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes or https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes",
"pattern": "^[a-z]{2,3}$"
},
"probType": {
"type": "number",
"title": "probType",
"description": "Probability of the prediction",
"minimum": 0,
"maximum": 1
},
"versionType": {
  "title": "versionType",
  "description": "Administrative information on tool version and data creation time.",
  "type": "object",
  "properties": {
    "version": {
      "description": "Version of tool. Either date or output of command 'git describe'",
      "type": "string"
    },
    "ts": {
      "description": "timestamp of creation of the JSON file (e.g. '2018-09-18T08:00:08+00:00')",
      "type": "string",
      "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(Z|\\+00:00)$"
    }
  },
  "required": [
    "ts",
    "version"
  ]
}
}
}