Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

using validated.tsv as source #104

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ __pycache__/*
.settings
.idea
tags
.vscode
a

# Package files
*.egg
Expand Down
3 changes: 3 additions & 0 deletions src/corporacreator/corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ def create(self):
"""
_logger.info("Creating corpora...")
corpora_data = self._parse_tsv()
# FIXME: THIS IS WRONG, DON'T USE VALIDATED.TSV AS A SOURCE
if not ("locale" in corpora_data.columns):
corpora_data["locale"] = [ self.args.langs[0] ] * len(corpora_data.index)
corpora_data[["sentence", "up_votes", "down_votes"]] = corpora_data[
["sentence", "up_votes", "down_votes"]
].swifter.apply(func=lambda arg: common_wrapper(*arg), axis=1)
Expand Down
111 changes: 111 additions & 0 deletions src/corporacreator/preprocessors/kab.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,113 @@
# created by Mestafa Kamal

import string

"""
Keep Upper case
Keep Punctuation

Replace wrong characters
Remove bad spaces

Strip
Unvalidate sentences containing not allowed characters
"""

# Letters accepted in a cleaned sentence: the ASCII alphabet plus the extra
# letters listed below, each in lower and upper case.
allowed = list(string.ascii_lowercase)
allowed.extend(list("ẓṛṭɛṣḍǧḥɣč"))

# Upper-case counterpart of every lower-case letter collected so far.
majuscule = [letter.upper() for letter in allowed]

allowed.extend(majuscule)

# Maps wrong characters to the character that replaces them. A key made of
# several characters means every one of them maps to the same value.
replacer = {
    "Ğ": "Ǧ",
    "ğ": "ǧ",
    "Γ": "Ɣ",
    "γ": "ɣ",
    "Σ": "Ɛ",
    "Ԑ": "Ɛ",
    "εσ": "ɛ",
    "«»“”": "\"",
}

# Non-letter characters accepted in a cleaned sentence.
punctuation = [
    " ",
    "-",
    ".",
    "?",
    ",",
    "!",
    ";",
    "_",
    ":",
    "/",
    "(",
    ")",
    "{",
    "}",
    "[",
    "]",
    "\"",
]

# Flattened form of `replacer`: one entry per single wrong character.
# Built with a comprehension so that no loop variable stays bound at module
# level; in particular nothing here may be named after a builtin such as
# `all`, which would shadow it for the rest of the module.
replacements = {
    wrong_char: replacement
    for wrong_chars, replacement in replacer.items()
    for wrong_char in wrong_chars
}

def remplaceSymbols(word):
    """Return `word` with every character listed in `replacements` substituted.

    Args:
        word (str): Text to normalise.

    Returns:
        (str): The text after applying each `replacements` entry in turn.
    """
    cleaned = word
    for wrong_char in replacements:
        cleaned = cleaned.replace(wrong_char, replacements[wrong_char])
    return cleaned

def removeBadSpace(sentence):
    """Drop a single space sitting directly before or after a hyphen.

    Args:
        sentence (str): Text to tidy.

    Returns:
        (str): The text with " -" and then "- " each replaced by "-".
    """
    # Order matters: spaces before hyphens are removed first, then spaces after.
    return sentence.replace(" -", "-").replace("- ", "-")

def replaceTs(word):
    """Rewrite every "ţ" in `word` using plain "t" characters.

    A trailing "ţţ" or "ţ" becomes a single "t" and a trailing "-ţ" becomes
    "-tt"; any "ţţ" or "ţ" left elsewhere in the word becomes "tt".

    Args:
        word (str): A single word.

    Returns:
        (str): The word with all "ţ" characters rewritten.
    """
    # (suffix to match, number of trailing characters to cut, text to append).
    # Only the first matching rule is applied.
    ending_rules = (
        ("ţţ", 2, "t"),
        ("-ţ", 2, "-tt"),
        ("ţ", 1, "t"),
    )
    for suffix, cut, tail in ending_rules:
        if word.endswith(suffix):
            word = word[:-cut] + tail
            break
    return word.replace("ţţ", "tt").replace("ţ", "tt")

def checkSentence (sentence):
    """Tell whether `sentence` contains only accepted characters.

    Args:
        sentence (str): Text to check.

    Returns:
        (bool): False if some character is in neither `allowed` nor
            `punctuation`, True otherwise (including for an empty string).
    """
    return not any(
        char not in allowed and char not in punctuation for char in sentence
    )

def cleanSentence(sentence):
    """Normalise `sentence` and reject it if unaccepted characters remain.

    Spaces around hyphens are removed, wrong symbols are replaced, then each
    space-separated word has its "ţ" characters rewritten and is stripped.

    Args:
        sentence (str): Raw sentence.

    Returns:
        (str): The cleaned sentence, or " " when `checkSentence` rejects it.
    """
    normalised = remplaceSymbols(removeBadSpace(sentence))

    tokens = [replaceTs(token).strip() for token in normalised.strip().split(" ")]
    candidate = " ".join(tokens)

    # A single space is returned in place of a sentence that fails the check.
    return candidate if checkSentence(candidate) else " "

def kab(client_id, sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.

Expand All @@ -9,4 +119,5 @@ def kab(client_id, sentence):
(str): Cleaned up sentence. Returning None or a `str` of whitespace flags the sentence as invalid.
"""
# TODO: Clean up kab data
sentence = cleanSentence(sentence)
return sentence