From b351a8f702047b381f8f02ab25cf5c540da15871 Mon Sep 17 00:00:00 2001
From: Jamie <jamie.lim@kakaocorp.com>
Date: Mon, 28 Jan 2019 00:20:19 +0900
Subject: [PATCH] Add scripts to create and apply corpus patches #3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 munjong/bin/apply_patch.py                 |  79 ++++
 munjong/bin/make_patch.py                  |  82 ++++
 src/main/python/khaiii/munjong/libpatch.py | 426 +++++++++++++++++++++
 3 files changed, 587 insertions(+)
 create mode 100755 munjong/bin/apply_patch.py
 create mode 100755 munjong/bin/make_patch.py
 create mode 100644 src/main/python/khaiii/munjong/libpatch.py

diff --git a/munjong/bin/apply_patch.py b/munjong/bin/apply_patch.py
new file mode 100755
index 0000000..44562ec
--- /dev/null
+++ b/munjong/bin/apply_patch.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+apply patch to original Sejong corpus
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser, Namespace
+import logging
+import os
+import shutil
+
+from khaiii.munjong import libpatch
+
+
+#############
+# functions #
+#############
+def run(args: Namespace):
+    """
+    run function which is the entry point of the program
+    Args:
+        args:  program arguments
+    """
+    if not os.path.exists(args.modified):
+        logging.info('creating modified corpus dir: %s', args.modified)
+        os.mkdir(args.modified)
+
+    for name in sorted(os.listdir(args.original)):
+        if not name.endswith('.txt'):
+            continue
+        org_path = '%s/%s' % (args.original, name)
+        mod_path = '%s/%s' % (args.modified, name)
+        patch_path = '%s/%s.patch' % (args.patch, name[:-len('.txt')])
+        if os.path.exists(patch_path):
+            logging.info('[%s] + [%s] = [%s]', org_path, patch_path, mod_path)
+            libpatch.apply(org_path, args.org_enc, patch_path, mod_path, args.mod_enc)
+        else:
+            logging.info('[%s] = [%s]', org_path, mod_path)
+            shutil.copyfile(org_path, mod_path)
+
+
+########
+# main #
+########
+def main():
+    """
+    main function processes only argument parsing
+    """
+    parser = ArgumentParser(description='apply patch to original Sejong corpus')
+    parser.add_argument('-o', '--original', help='original corpus dir', metavar='DIR',
+                        required=True)
+    parser.add_argument('-p', '--patch', help='patch dir', metavar='DIR', required=True)
+    parser.add_argument('-m', '--modified', help='modified corpus output dir', metavar='DIR',
+                        required=True)
+    parser.add_argument('--org-enc', help='original corpus encoding',
+                        metavar='ENCODING', default='UTF-16')
+    parser.add_argument('--mod-enc', help='modified corpus encoding',
+                        metavar='ENCODING', default='UTF-8')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    run(args)
+
+
+if __name__ == '__main__':
+    main()
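For illustration, a hypothetical invocation of apply_patch.py above. The
directory names are invented; the flags and default encodings (UTF-16 in,
UTF-8 out) come from the argparse definitions in this diff:

    munjong/bin/apply_patch.py -o sejong/original -p sejong/patch -m sejong/modified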
diff --git a/munjong/bin/make_patch.py b/munjong/bin/make_patch.py
new file mode 100755
index 0000000..25e3bd1
--- /dev/null
+++ b/munjong/bin/make_patch.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+"""
+make patch from two Sejong corpora
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from argparse import ArgumentParser, Namespace
+import logging
+import os
+
+from khaiii.munjong import libpatch
+
+
+#############
+# functions #
+#############
+def run(args: Namespace):
+    """
+    run function which is the entry point of the program
+    Args:
+        args:  program arguments
+    """
+    if not os.path.exists(args.patch):
+        logging.info('creating patch dir: %s', args.patch)
+        os.mkdir(args.patch)
+
+    for name in sorted(os.listdir(args.original)):
+        if not name.endswith('.txt'):
+            continue
+        org_path = '%s/%s' % (args.original, name)
+        mod_path = '%s/%s' % (args.modified, name)
+        patch_path = '%s/%s.patch' % (args.patch, name[:-len('.txt')])
+        logging.info('[%s] - [%s] = [%s]', org_path, mod_path, patch_path)
+        patches = libpatch.make(org_path, args.org_enc, mod_path, args.mod_enc)
+        if patches:
+            logging.info('creating patch file: %s', patch_path)
+            with open(patch_path, 'w', encoding='UTF-8') as fout:
+                for patch in patches:
+                    print(patch, file=fout)
+        elif os.path.exists(patch_path):
+            logging.info('removing existing patch file: %s', patch_path)
+            os.remove(patch_path)
+
+
+########
+# main #
+########
+def main():
+    """
+    main function processes only argument parsing
+    """
+    parser = ArgumentParser(description='make patch from two Sejong corpora')
+    parser.add_argument('-o', '--original', help='original corpus dir', metavar='DIR',
+                        required=True)
+    parser.add_argument('-m', '--modified', help='modified corpus dir', metavar='DIR',
+                        required=True)
+    parser.add_argument('-p', '--patch', help='patch output dir', metavar='DIR', required=True)
+    parser.add_argument('--org-enc', help='original corpus encoding',
+                        metavar='ENCODING', default='UTF-16')
+    parser.add_argument('--mod-enc', help='modified corpus encoding',
+                        metavar='ENCODING', default='UTF-8')
+    parser.add_argument('--debug', help='enable debug', action='store_true')
+    args = parser.parse_args()
+
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    run(args)
+
+
+if __name__ == '__main__':
+    main()
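For illustration, a hypothetical make_patch.py run and the kind of patch file
it could emit. The word IDs and morpheme analyses are invented; the
tab-separated line format ('=' replace, '+' insert, '-' delete, 'S'/'M'
sentence split/merge keyed by the previous and next word IDs) follows the
Patch class in libpatch.py below:

    munjong/bin/make_patch.py -o sejong/original -m sejong/modified -p sejong/patch

    S	BTAA0001-00000033	BTAA0001-00000034
    =	BTAA0001-00000035	하늘이	하늘/NNG + 이/JKS
    +	BTAA0001-00000036	푸르다	푸르/VA + 다/EF
    -	BTAA0001-00000037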
diff --git a/src/main/python/khaiii/munjong/libpatch.py b/src/main/python/khaiii/munjong/libpatch.py
new file mode 100644
index 0000000..13e3ecf
--- /dev/null
+++ b/src/main/python/khaiii/munjong/libpatch.py
@@ -0,0 +1,426 @@
+# -*- coding: utf-8 -*-
+
+
+"""
+patch library for Sejong corpus
+__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
+__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
+"""
+
+
+###########
+# imports #
+###########
+from collections import namedtuple
+import logging
+import os
+from typing import Dict, List, Tuple
+
+from khaiii.munjong.sejong_corpus import SENT_OPEN_TAGS, SENT_CLOSE_TAGS, WORD_ID_PTN
+
+
+#############
+# constants #
+#############
+# line types
+WORD_TYPE = 1    # word (eojeol)
+BOS_TYPE = 2    # begin of sentence markup
+EOS_TYPE = 3    # end of sentence markup
+MISC_TYPE = 0    # other miscellaneous lines
+
+
+#########
+# types #
+#########
+Line = namedtuple('Line', ['type_', 'wid', 'content'])
+
+
+class Patch:    # pylint: disable=too-few-public-methods
+    """
+    patch line
+    """
+    # patch types (actions, operations)
+    SENT_SPLIT = 1
+    SENT_MERGE = 2
+    WORD_REPLACE = 3
+    WORD_INSERT = 4
+    WORD_DELETE = 5
+    # patch categories
+    SENT_CATE = 1
+    WORD_CATE = 2
+
+    def __init__(self, type_: int, wid: str, content: str):
+        """
+        Args:
+            type_:  patch type
+            wid:  word ID
+            content:  word content (tab separated second column)
+        """
+        self.type_ = type_
+        self.wid = wid
+        self.content = content
+
+    def __eq__(self, other: 'Patch') -> bool:
+        return self.type_ == other.type_ and self.wid == other.wid and self.content == other.content
+
+    def __cmp__(self, other: 'Patch') -> int:
+        if self.cate() == other.cate():
+            if self.wid < other.wid:
+                return -1
+            if self.wid > other.wid:
+                return 1
+            return 0
+        return self.cate() - other.cate()
+
+    def __lt__(self, other: 'Patch') -> bool:
+        return self.__cmp__(other) < 0
+
+    def __str__(self) -> str:
+        if self.type_ == self.WORD_REPLACE:
+            return '=\t%s\t%s' % (self.wid, self.content)
+        if self.type_ == self.WORD_INSERT:
+            return '+\t%s\t%s' % (self.wid, self.content)
+        if self.type_ == self.WORD_DELETE:
+            return '-\t%s' % self.wid
+        if self.type_ == self.SENT_SPLIT:
+            return 'S\t%s\t%s' % (self.wid, self.content)
+        if self.type_ == self.SENT_MERGE:
+            return 'M\t%s\t%s' % (self.wid, self.content)
+        raise RuntimeError('unknown patch type: %d' % self.type_)
+
+    def cate(self) -> int:
+        """
+        get patch category. sentence (EOS/BOS) patches are 1, word patches are 2
+        Returns:
+            category number
+        """
+        return self.SENT_CATE if self.type_ in [self.SENT_SPLIT, self.SENT_MERGE] \
+                else self.WORD_CATE
+
+    @classmethod
+    def parse(cls, line: str) -> 'Patch':
+        """
+        parse patch line
+        Args:
+            line:  patch line
+        Returns:
+            Patch object
+        """
+        cols = line.split('\t', 2)
+        if len(cols) == 3:
+            if cols[0] == '=':
+                return Patch(cls.WORD_REPLACE, cols[1], cols[2])
+            if cols[0] == '+':
+                return Patch(cls.WORD_INSERT, cols[1], cols[2])
+            if cols[0].upper() == 'M':
+                return Patch(cls.SENT_MERGE, cols[1], cols[2])
+            if cols[0].upper() == 'S':
+                return Patch(cls.SENT_SPLIT, cols[1], cols[2])
+        elif len(cols) == 2:
+            if cols[0] == '-':
+                return Patch(cls.WORD_DELETE, cols[1], '')
+        raise RuntimeError('invalid patch format: %s' % line)
+
+
+#############
+# functions #
+#############
+def _load_corpus(path: str, enc: str) -> Tuple[List[Line], Dict[str, int]]:
+    """
+    load corpus
+    Args:
+        path:  file path
+        enc:  file encoding
+    Returns:
+        list of lines
+        word ID dictionary (word ID to line index)
+    """
+    lines = []
+    wid_dic = {}
+    for line in open(path, 'r', encoding=enc):
+        line = line.rstrip('\r\n')
+        if WORD_ID_PTN.match(line):
+            wid, content = line.split('\t', 1)
+            if wid in wid_dic:
+                raise RuntimeError('duplicated word ID: %s' % line)
+            else:
+                wid_dic[wid] = len(lines)
+                lines.append(Line(WORD_TYPE, wid, content))
+        elif line in SENT_OPEN_TAGS:
+            lines.append(Line(BOS_TYPE, None, line))
+        elif line in SENT_CLOSE_TAGS:
+            lines.append(Line(EOS_TYPE, None, line))
+        else:
+            lines.append(Line(MISC_TYPE, None, line))
+            if line == '' and 'BTJO0443.txt' in path:
+                break    # workaround: this file is read only up to its first empty line
+    return lines, wid_dic
+
+
+def _make_sent_patch(org_lines: List[Line], mod_lines: List[Line]) -> List[Patch]:
+    """
+    make EOS/BOS patch
+    Args:
+        org_lines:  original lines
+        mod_lines:  modified lines
+    Returns:
+        EOS/BOS patches
+    """
+    def _get_eos_bos(lines: List[Line]) -> List[Tuple[str, str]]:
+        """
+        get all EOS/BOS boundaries from lines
+        Args:
+            lines:  lines of corpus
+        Returns:
+            list of sentence boundaries:
+            (prev. word ID, next word ID) pairs
+        """
+        eos_bos = []
+        for idx, line in enumerate(lines):
+            if line.type_ == EOS_TYPE and (idx+2) < len(lines) and lines[idx+1].type_ == BOS_TYPE \
+                    and lines[idx-1].type_ == WORD_TYPE and lines[idx+2].type_ == WORD_TYPE:
+                prev_wid = lines[idx-1].wid
+                next_wid = lines[idx+2].wid
+                eos_bos.append((prev_wid[:17], next_wid[:17]))
+        return eos_bos
+
+    org_eos_bos = set(_get_eos_bos(org_lines))
+    mod_eos_bos = set(_get_eos_bos(mod_lines))
+    patches = []
+    for prev_wid, next_wid in org_eos_bos - mod_eos_bos:
+        patches.append(Patch(Patch.SENT_MERGE, prev_wid, next_wid))
+    for prev_wid, next_wid in mod_eos_bos - org_eos_bos:
+        patches.append(Patch(Patch.SENT_SPLIT, prev_wid, next_wid))
+    return sorted(patches)
+
+
+def _make_word_patch(org_lines: List[Line], org_wid_dic: Dict[str, int], mod_lines: List[Line],
+                     mod_wid_dic: Dict[str, int]) -> List[Patch]:
+    """
+    make word patch
+    Args:
+        org_lines:  original lines
+        org_wid_dic:  original word ID dictionary
+        mod_lines:  modified lines
+        mod_wid_dic:  modified word ID dictionary
+    Returns:
+        word patches
+    """
+    patches = []
+    for org_line in org_lines:
+        if org_line.type_ != WORD_TYPE:
+            continue
+        if org_line.wid in mod_wid_dic:
+            mod_line = mod_lines[mod_wid_dic[org_line.wid]]
+            if org_line.content != mod_line.content:
+                patches.append(Patch(Patch.WORD_REPLACE, mod_line.wid, mod_line.content))
+        else:
+            patches.append(Patch(Patch.WORD_DELETE, org_line.wid, ''))
+    for mod_line in mod_lines:
+        if mod_line.type_ != WORD_TYPE or mod_line.wid in org_wid_dic:
+            continue
+        patches.append(Patch(Patch.WORD_INSERT, mod_line.wid, mod_line.content))
+    return sorted(patches)
+
+
+def make(org_path: str, org_enc: str, mod_path: str, mod_enc: str) -> List[Patch]:
+    """
+    make patch from two files
+    Args:
+        org_path:  original file path
+        org_enc:  original file encoding
+        mod_path:  modified file path
+        mod_enc:  modified file encoding
+    Returns:
+        patch contents (list of patch lines)
+    """
+    org_lines, org_wid_dic = _load_corpus(org_path, org_enc)
+    mod_lines, mod_wid_dic = _load_corpus(mod_path, mod_enc)
+
+    sent_patches = _make_sent_patch(org_lines, mod_lines)
+    word_patches = _make_word_patch(org_lines, org_wid_dic, mod_lines, mod_wid_dic)
+    return sent_patches + word_patches
+
+
+def _load_patch(patch_path: str) -> List[Patch]:
+    """
+    load patch from file
+    Args:
+        patch_path:  patch path
+    Returns:
+        patches
+    """
+    patch_name = os.path.basename(patch_path)
+    wid_dic = set()    # word IDs (and boundary pairs) seen so far, to detect conflicts
+    patches = []
+    for line_num, line in enumerate(open(patch_path, 'r', encoding='UTF-8'), start=1):
+        line = line.rstrip('\r\n')
+        if not line:
+            continue
+        patch = Patch.parse(line)
+        if patch.cate() == Patch.SENT_CATE:
+            if (patch.wid, patch.content) in wid_dic:
+                logging.error('%s(%d): patch conflict: %s', patch_name, line_num, line)
+            else:
+                patches.append(patch)
+                wid_dic.add((patch.wid, patch.content))
+        else:
+            if patch.wid in wid_dic:
+                logging.error('%s(%d): patch conflict: %s', patch_name, line_num, line)
+            else:
+                patches.append(patch)
+                wid_dic.add(patch.wid)
+    return patches
+
+
+def _apply_sent_merge_patch(org_lines: List[Line], patches: List[Patch]) -> List[Line]:
+    """
+    apply EOS/BOS merge patches
+    Args:
+        org_lines:  original lines
+        patches:  patches
+    Returns:
+        modified lines
+    """
+    merge_patches = {patch.wid: patch.content for patch in patches \
+                     if patch.type_ == Patch.SENT_MERGE}
+    mod_lines = []
+    idx = 0
+    while idx < len(org_lines):
+        org_line = org_lines[idx]
+        if org_line.type_ == EOS_TYPE and (idx+2) < len(org_lines) and org_lines[idx+1].type_ == BOS_TYPE \
+                and org_lines[idx-1].type_ == WORD_TYPE and org_lines[idx+2].type_ == WORD_TYPE:
+            prev_wid = org_lines[idx-1].wid[:17]
+            next_wid = org_lines[idx+2].wid[:17]
+            if prev_wid in merge_patches and merge_patches[prev_wid] == next_wid:
+                del merge_patches[prev_wid]
+                idx += 2
+                continue
+        mod_lines.append(org_line)
+        idx += 1
+    if merge_patches:
+        for prev_wid, next_wid in merge_patches.items():
+            logging.error('remaining merge sentence patches: %s\t%s', prev_wid, next_wid)
+    return mod_lines
+
+
+def _build_wid_dic(lines: List[Line]) -> Dict[str, int]:
+    """
+    build word ID dictionary
+    Args:
+        lines:  lines of corpus
+    Returns:
+        word ID dictionary
+    """
+    return {line.wid: idx for idx, line in enumerate(lines) if line.type_ == WORD_TYPE}
+
+
+def _apply_sent_split_patch(mod_lines: List[Line], patches: List[Patch]) -> List[Line]:
+    """
+    apply EOS/BOS split patches
+    Args:
+        mod_lines:  modified lines
+        patches:  patches
+    Returns:
+        modified lines
+    """
+    mod_wid_dic = _build_wid_dic(mod_lines)
+    split_patches = {patch.wid: patch.content for patch in patches \
+                     if patch.type_ == Patch.SENT_SPLIT}
+    for prev_wid, next_wid in sorted(split_patches.items(), key=lambda x: x[0], reverse=True):
+        idx = mod_wid_dic[next_wid]
+        assert mod_lines[idx].wid == next_wid
+        if mod_lines[idx-1].type_ == WORD_TYPE and mod_lines[idx-1].wid[:17] == prev_wid:
+            mod_lines.insert(idx, Line(BOS_TYPE, None, '<p>'))
+            mod_lines.insert(idx, Line(EOS_TYPE, None, '</p>'))
+            del split_patches[prev_wid]
+    if split_patches:
+        for prev_wid, next_wid in split_patches.items():
+            logging.error('remaining split sentence patches: %s\t%s', prev_wid, next_wid)
+    return mod_lines
+
+
+def _apply_word_del_rep_patch(mod_lines: List[Line], patches: List[Patch]) -> List[Line]:
+    """
+    apply word delete/replace patches
+    Args:
+        mod_lines:  modified lines
+        patches:  patches
+    Returns:
+        modified lines
+    """
+    delete_patches = {patch.wid for patch in patches if patch.type_ == Patch.WORD_DELETE}
+    replace_patches = {patch.wid: patch.content for patch in patches \
+                       if patch.type_ == Patch.WORD_REPLACE}
+    new_lines = []
+    for line in mod_lines:
+        if line.type_ == WORD_TYPE:
+            if line.wid in delete_patches:
+                delete_patches.remove(line.wid)
+                continue
+            elif line.wid in replace_patches:
+                new_lines.append(Line(WORD_TYPE, line.wid, replace_patches[line.wid]))
+                del replace_patches[line.wid]
+                continue
+        new_lines.append(line)
+    if delete_patches:
+        for wid in delete_patches:
+            logging.error('remaining delete word patches: %s', wid)
+    if replace_patches:
+        for wid, content in replace_patches.items():
+            logging.error('remaining replace word patches: %s\t%s', wid, content)
+    return new_lines
+
+
+def _apply_word_insert_patch(mod_lines: List[Line], patches: List[Patch]) -> List[Line]:
+    """
+    apply word insert patches
+    Args:
+        mod_lines:  modified lines
+        patches:  patches
+    Returns:
+        modified lines
+    """
+    insert_patches = sorted([(patch.wid, patch.content) for patch in patches \
+                             if patch.type_ == Patch.WORD_INSERT])
+    prev_idx = -1
+    curr_idx = 0
+    while curr_idx < len(mod_lines) and insert_patches:
+        curr_word = mod_lines[curr_idx]
+        if curr_word.type_ != WORD_TYPE:
+            curr_idx += 1
+            continue
+        wid, content = insert_patches[0]
+        if curr_word.wid < wid:
+            prev_idx = curr_idx
+            curr_idx += 1
+            continue
+        mod_lines.insert(prev_idx+1, Line(WORD_TYPE, wid, content))
+        del insert_patches[0]
+        prev_idx += 1
+        curr_idx += 1
+    return mod_lines
+
+
+def apply(org_path: str, org_enc: str, patch_path: str, mod_path: str, mod_enc: str):
+    """
+    apply patch to original corpus to get modified corpus
+    Args:
+        org_path:  original file path
+        org_enc:  original file encoding
+        patch_path:  patch file path
+        mod_path:  modified file path
+        mod_enc:  modified file encoding
+    """
+    patches = _load_patch(patch_path)
+    org_lines, _ = _load_corpus(org_path, org_enc)
+    mod_lines = _apply_sent_merge_patch(org_lines, patches)
+    mod_lines = _apply_word_del_rep_patch(mod_lines, patches)
+    mod_lines = _apply_sent_split_patch(mod_lines, patches)
+    mod_lines = _apply_word_insert_patch(mod_lines, patches)
+    with open(mod_path, 'w', encoding=mod_enc) as fout:
+        for mod_line in mod_lines:
+            if mod_line.type_ == WORD_TYPE:
+                print('%s\t%s' % (mod_line.wid, mod_line.content), file=fout)
+            else:
+                print(mod_line.content, file=fout)
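For illustration, a minimal sketch of calling libpatch directly instead of
going through the two scripts. The file paths are invented; the signatures of
make() and apply() are as defined above. Note that apply() replays patches in
a fixed order: sentence merges, then word deletes/replaces, then sentence
splits, then word inserts.

    from khaiii.munjong import libpatch

    # diff two versions of one corpus file into a list of Patch objects
    patches = libpatch.make('original/BTAA0001.txt', 'UTF-16',
                            'modified/BTAA0001.txt', 'UTF-8')

    # each Patch serializes to one tab-separated patch-file line via __str__()
    with open('patch/BTAA0001.patch', 'w', encoding='UTF-8') as fout:
        for patch in patches:
            print(patch, file=fout)

    # rebuild the modified file from the original plus the patch
    libpatch.apply('original/BTAA0001.txt', 'UTF-16',
                   'patch/BTAA0001.patch', 'rebuilt/BTAA0001.txt', 'UTF-8')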