#3 getting train.py to work on json data
ablodge committed Jul 11, 2020
1 parent f3e8920 commit 05853d6
Showing 13 changed files with 109 additions and 91 deletions.
2 changes: 1 addition & 1 deletion parser/AMRGraph.py
@@ -4,7 +4,7 @@
 import random
 from collections import defaultdict
 
-from parser.amr import AMR
+from amr import AMR
 
 number_regexp = re.compile(r'^-?(\d)+(\.\d+)?$')
 abstract_regexp0 = re.compile(r'^([A-Z]+_)+\d+$')
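Throughout this commit, package-relative imports (from parser.X ...) become top-level imports (from X ...), so the modules resolve when the scripts are run directly from inside the parser/ directory rather than as a package from the repository root. A minimal sketch of the difference (the invocations are illustrative, not from the commit):

    # Package-relative import: run from the repository root as a module,
    # e.g. `python -m parser.train`
    from parser.amr import AMR

    # Top-level import: run from inside parser/ (or with parser/ on sys.path),
    # e.g. `cd parser && python train.py`
    from amr import AMR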
12 changes: 6 additions & 6 deletions parser/parser.py → parser/amr_parser.py
@@ -3,12 +3,12 @@
 import torch.nn.functional as F
 import math
 
-from parser.encoder import WordEncoder, ConceptEncoder
-from parser.decoder import DecodeLayer
-from parser.transformer import Transformer, SinusoidalPositionalEmbedding, SelfAttentionMask
-from parser.data import ListsToTensor, ListsofStringToTensor, DUM, NIL, PAD
-from parser.search import Hypothesis, Beam, search_by_batch
-from parser.utils import move_to_device
+from encoder import WordEncoder, ConceptEncoder
+from decoder import DecodeLayer
+from transformer import Transformer, SinusoidalPositionalEmbedding, SelfAttentionMask
+from data import ListsToTensor, ListsofStringToTensor, DUM, NIL, PAD
+from search import Hypothesis, Beam, search_by_batch
+from utils import move_to_device
 
 class Parser(nn.Module):
     def __init__(self, vocabs, word_char_dim, word_dim, pos_dim, ner_dim,
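The rename from parser.py to amr_parser.py likely follows from the same import change: with top-level imports, a module literally named parser is ambiguous (it can shadow, or be shadowed by, Python's deprecated standard-library parser module), so the rename sidesteps the collision. This rationale is inferred from the diff, not stated in the commit.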
6 changes: 3 additions & 3 deletions parser/data.py
@@ -2,8 +2,8 @@
 import torch
 from torch import nn
 import numpy as np
-from parser.AMRGraph import AMRGraph
-from parser.extract import read_file
+from AMRGraph import AMRGraph
+from extract import read_file
 
 PAD, UNK, DUM, NIL, END, CLS = '<PAD>', '<UNK>', '<DUMMY>', '<NULL>', '<END>', '<CLS>'
 GPU_SIZE = 12000 # okay for 8G memory
@@ -14,7 +14,7 @@ def __init__(self, filename, min_occur_cnt, specials = None):
         self._priority = dict()
         num_tot_tokens = 0
         num_vocab_tokens = 0
-        for line in open(filename).readlines():
+        for line in open(filename, encoding='utf8').readlines():
             try:
                 token, cnt = line.rstrip('\n').split('\t')
                 cnt = int(cnt)
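The explicit encoding matters because open() without one falls back to the platform's locale codec (often cp1252 on Windows), so UTF-8 vocab files containing non-ASCII tokens can fail to decode. A minimal sketch of the failure mode this avoids (the file name is hypothetical):

    # Without encoding='utf8', this loop can raise UnicodeDecodeError on
    # Windows whenever the vocab file contains non-ASCII tokens written as UTF-8.
    with open('vocab/tok_vocab', encoding='utf8') as f:
        for line in f:
            token, cnt = line.rstrip('\n').split('\t')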
8 changes: 4 additions & 4 deletions parser/decoder.py
@@ -2,11 +2,11 @@
 from torch import nn
 import torch.nn.functional as F
 from torch.nn import Parameter
-from parser.data import NIL, PAD
-from parser.utils import compute_f_by_tensor
-from parser.transformer import MultiheadAttention, Transformer, TiedTransformer
+from data import NIL, PAD
+from utils import compute_f_by_tensor
+from transformer import MultiheadAttention, Transformer, TiedTransformer
 
-from parser.utils import label_smoothed_nll_loss
+from utils import label_smoothed_nll_loss
 
 class ArcGenerator(nn.Module):
     def __init__(self, vocabs, embed_dim, ff_embed_dim, num_heads, dropout):
2 changes: 1 addition & 1 deletion parser/encoder.py
@@ -4,7 +4,7 @@
 from torch import nn
 import torch.nn.functional as F
 import re
-from parser.transformer import Embedding
+from transformer import Embedding
 
 def AMREmbedding(vocab, embedding_dim, pretrained_file=None, amr=False, dump_file=None):
     if pretrained_file is None:
11 changes: 6 additions & 5 deletions parser/extract.py
@@ -1,11 +1,12 @@
 #!/usr/bin/env python
 # coding: utf-8
+import os
 from collections import Counter
 import json, re
 
-from parser.amr import AMR
-from parser.AMRGraph import AMRGraph, number_regexp
-from parser.AMRGraph import _is_abs_form
+from amr import AMR
+from AMRGraph import AMRGraph, number_regexp
+from AMRGraph import _is_abs_form
 class AMRIO:
 
     def __init__(self):
@@ -19,7 +20,7 @@ def read(file_path):
             tokens = amr_json['tokens']
             lemmas = amr_json['lemmas']
             pos_tags = amr_json['pos']
-            ner_tags = ner_tags['ner']
+            ner_tags = amr_json['ner']
             myamr = AMRGraph.parse_json(line)
             yield tokens, lemmas, pos_tags, ner_tags, myamr

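The deleted line read ner_tags['ner'] before ner_tags was ever assigned, which would presumably raise a NameError on the first record; the fix indexes amr_json like its sibling fields. For reference, the reader assumes one JSON object per line with parallel token-level fields, roughly like this (a hypothetical record; the exact schema is whatever preprocess.py emits):

    amr_json = {
        "tokens": ["The", "boy", "wants", "to", "go"],
        "lemmas": ["the", "boy", "want", "to", "go"],
        "pos":    ["DT", "NN", "VBZ", "TO", "VB"],
        "ner":    ["O", "O", "O", "O", "O"],
        # ...plus the graph fields consumed by AMRGraph.parse_json
    }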
@@ -81,7 +82,7 @@ def make_vocab(batch_seq, char_level=False):
 
 
 def write_vocab(vocab, path):
-    with open(path, 'w') as fo:
+    with open(os.path.join('vocab',path), 'w', encoding='utf8') as fo:
         for x, y in vocab.most_common():
             fo.write('%s\t%d\n'%(x,y))
 
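write_vocab now writes into a vocab/ subdirectory (hence the new import os at the top of the file) and forces UTF-8 on the write side, matching the reader in data.py. Note that open() will not create the directory; a defensive sketch, where the os.makedirs call is my addition rather than part of the commit:

    import os

    def write_vocab(vocab, path):
        os.makedirs('vocab', exist_ok=True)  # not in the commit; avoids FileNotFoundError
        with open(os.path.join('vocab', path), 'w', encoding='utf8') as fo:
            for x, y in vocab.most_common():
                fo.write('%s\t%d\n' % (x, y))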
5 changes: 2 additions & 3 deletions parser/postprocess.py
@@ -3,12 +3,11 @@
 import networkx as nx
 import numpy as np
 
-from parser.AMRGraph import is_attr_or_abs_form, need_an_instance
+from AMRGraph import is_attr_or_abs_form, need_an_instance
 
 
 class PostProcessor(object):
     def __init__(self, rel_vocab):
-        self.amr = penman.AMRCodec()
         self.rel_vocab = rel_vocab
 
     def to_triple(self, res_concept, res_relation):
@@ -60,7 +59,7 @@ def to_triple(self, res_concept, res_relation):
         return ret
 
     def get_string(self, x):
-        return self.amr.encode(penman.Graph(x), top=x[0][0])
+        return penman.encode(penman.Graph(x), top=x[0][0])
 
     def postprocess(self, concept, relation):
         mstr = self.get_string(self.to_triple(concept, relation))
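Dropping the AMRCodec instance tracks the penman 1.x API, where the codec classes were removed in favor of module-level functions. A minimal sketch of the replacement call, assuming penman >= 1.0 (the triples are invented for illustration):

    import penman

    # Triples are (source, role, target); the top variable is passed
    # explicitly, just as get_string passes x[0][0].
    triples = [('c', ':instance', 'chase-01'),
               ('c', ':ARG0', 'd'),
               ('d', ':instance', 'dog')]
    g = penman.Graph(triples)
    print(penman.encode(g, top='c'))  # prints roughly: (c / chase-01 :ARG0 (d / dog))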
2 changes: 1 addition & 1 deletion parser/preprocess.py
@@ -5,7 +5,7 @@
 
 
 def main():
-    amr_file = r'C:\Users\austi\Desktop\Shared Task\mrp\2020\cf\training\amr.mrp'
+    amr_file = r'../../mrp/2020/cf/training/amr.mrp'
 
     nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,ner')
 
4 changes: 2 additions & 2 deletions parser/search.py
@@ -1,6 +1,6 @@
 import torch
-from parser.data import END, UNK
-from parser.AMRGraph import is_attr_or_abs_form
+from data import END, UNK
+from AMRGraph import is_attr_or_abs_form
 """
 Beam search by batch
 need model has two functions:
2 changes: 1 addition & 1 deletion parser/test_data.py
@@ -13,7 +13,7 @@ def main():
     amrs = []
     with open(amr_file, 'r', encoding='utf8') as f:
         for line in f:
-            amr = AMR.parse_json(line)
+            amr = json.loads(line)
             amrs.append(amr)
 
     drgs = []
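The new call also needs import json at the top of test_data.py, which falls outside this hunk. A self-contained sketch of the JSON-lines reading pattern (the path is hypothetical):

    import json

    amrs = []
    with open('../data/amr.mrp', 'r', encoding='utf8') as f:
        for line in f:
            amrs.append(json.loads(line))  # one JSON object per line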
130 changes: 74 additions & 56 deletions parser/train.py
@@ -3,74 +3,86 @@
 import torch.multiprocessing as mp
 
 import argparse, os, random
-from parser.data import Vocab, DataLoader, DUM, END, CLS, NIL
-from parser.parser import Parser
-from parser.work import show_progress
-from parser.extract import LexicalMap
-from parser.adam import AdamWeightDecayOptimizer
-from parser.utils import move_to_device
-from parser.bert_utils import BertEncoderTokenizer, BertEncoder
-from parser.postprocess import PostProcessor
-from parser.work import parse_data
+from data import Vocab, DataLoader, DUM, END, CLS, NIL
+from amr_parser import Parser
+from work import show_progress
+from extract import LexicalMap
+from adam import AdamWeightDecayOptimizer
+from utils import move_to_device
+from bert_utils import BertEncoderTokenizer, BertEncoder
+from postprocess import PostProcessor
+from work import parse_data
 
 def parse_config():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--tok_vocab', type=str)
-    parser.add_argument('--lem_vocab', type=str)
-    parser.add_argument('--pos_vocab', type=str)
-    parser.add_argument('--ner_vocab', type=str)
-    parser.add_argument('--concept_vocab', type=str)
-    parser.add_argument('--predictable_concept_vocab', type=str)
-    parser.add_argument('--rel_vocab', type=str)
-    parser.add_argument('--word_char_vocab', type=str)
-    parser.add_argument('--concept_char_vocab', type=str)
+    # --lem_vocab ${dataset}/vocab/lem_vocab\
+    # --pos_vocab ${dataset}/vocab/pos_vocab\
+    # --ner_vocab ${dataset}/vocab/ner_vocab\
+    # --concept_vocab ${dataset}/vocab/concept_vocab\
+    # --predictable_concept_vocab ${dataset}/vocab/predictable_concept_vocab\
+    # --rel_vocab ${dataset}/vocab/rel_vocab\
+    # --word_char_vocab ${dataset}/vocab/word_char_vocab\
+    # --concept_char_vocab ${dataset}/vocab/concept_char_vocab\
+    # --train_data ${dataset}/amr.extended.mrp \
+    # --dev_data ${dataset}/amr.extended.mrp \
+    # --with_bert \
+    # --bert_path ../bert-base-cased \
+    # -
+    parser.add_argument('--tok_vocab', type=str, default='vocab/tok_vocab')
+    parser.add_argument('--lem_vocab', type=str, default='vocab/lem_vocab')
+    parser.add_argument('--pos_vocab', type=str, default='vocab/pos_vocab')
+    parser.add_argument('--ner_vocab', type=str, default='vocab/ner_vocab')
+    parser.add_argument('--concept_vocab', type=str, default='vocab/concept_vocab')
+    parser.add_argument('--predictable_concept_vocab', type=str, default='vocab/predictable_concept_vocab')
+    parser.add_argument('--rel_vocab', type=str, default='vocab/rel_vocab')
+    parser.add_argument('--word_char_vocab', type=str, default='vocab/word_char_vocab')
+    parser.add_argument('--concept_char_vocab', type=str, default='vocab/concept_char_vocab')
     parser.add_argument('--pretrained_file', type=str, default=None)
     parser.add_argument('--with_bert', dest='with_bert', action='store_true')
     parser.add_argument('--bert_path', type=str, default=None)
 
-    parser.add_argument('--word_char_dim', type=int)
-    parser.add_argument('--word_dim', type=int)
-    parser.add_argument('--pos_dim', type=int)
-    parser.add_argument('--ner_dim', type=int)
-    parser.add_argument('--concept_char_dim', type=int)
-    parser.add_argument('--concept_dim', type=int)
-    parser.add_argument('--rel_dim', type=int)
+    parser.add_argument('--word_char_dim', type=int, default=32)
+    parser.add_argument('--word_dim', type=int, default=300)
+    parser.add_argument('--pos_dim', type=int, default=32)
+    parser.add_argument('--ner_dim', type=int, default=16)
+    parser.add_argument('--concept_char_dim', type=int, default=32)
+    parser.add_argument('--concept_dim', type=int, default=300)
+    parser.add_argument('--rel_dim', type=int, default=100)
 
-    parser.add_argument('--cnn_filters', type=int, nargs = '+')
-    parser.add_argument('--char2word_dim', type=int)
-    parser.add_argument('--char2concept_dim', type=int)
+    parser.add_argument('--cnn_filters', type=int, nargs = '+', default=(3,256))
+    parser.add_argument('--char2word_dim', type=int, default=128)
+    parser.add_argument('--char2concept_dim', type=int, default=128)
+    # --cnn_filter 3 256\
 
-    parser.add_argument('--embed_dim', type=int)
-    parser.add_argument('--ff_embed_dim', type=int)
-    parser.add_argument('--num_heads', type=int)
-    parser.add_argument('--snt_layers', type=int)
-    parser.add_argument('--graph_layers', type=int)
-    parser.add_argument('--inference_layers', type=int)
+    parser.add_argument('--embed_dim', type=int, default=512)
+    parser.add_argument('--ff_embed_dim', type=int, default=1024)
+    parser.add_argument('--num_heads', type=int, default=8)
+    parser.add_argument('--snt_layers', type=int, default=4)
+    parser.add_argument('--graph_layers', type=int, default=2)
+    parser.add_argument('--inference_layers', type=int, default=4)
 
-    parser.add_argument('--dropout', type=float)
-    parser.add_argument('--unk_rate', type=float)
+    parser.add_argument('--dropout', type=float, default=0.2)
+    parser.add_argument('--unk_rate', type=float, default=0.33)
 
-    parser.add_argument('--epochs', type=int)
+    parser.add_argument('--epochs', type=int, default=1000)
     parser.add_argument('--train_data', type=str)
    parser.add_argument('--dev_data', type=str)
-    parser.add_argument('--train_batch_size', type=int)
-    parser.add_argument('--batches_per_update', type=int)
-    parser.add_argument('--dev_batch_size', type=int)
-    parser.add_argument('--lr_scale', type=float)
-    parser.add_argument('--warmup_steps', type=int)
+    parser.add_argument('--train_batch_size', type=int, default=4444)
+    parser.add_argument('--batches_per_update', type=int, default=4)
+    parser.add_argument('--dev_batch_size', type=int, default=4444)
+    parser.add_argument('--lr_scale', type=float, default=1)
+    parser.add_argument('--warmup_steps', type=int, default=2000)
     parser.add_argument('--resume_ckpt', type=str, default=None)
-    parser.add_argument('--ckpt', type=str)
-    parser.add_argument('--print_every', type=int)
-    parser.add_argument('--eval_every', type=int)
+    parser.add_argument('--ckpt', type=str, default='ckpt')
+    parser.add_argument('--print_every', type=int, default=100)
+    parser.add_argument('--eval_every', type=int, default=1000)
 
-    parser.add_argument('--world_size', type=int)
-    parser.add_argument('--gpus', type=int)
-    parser.add_argument('--MASTER_ADDR', type=str)
-    parser.add_argument('--MASTER_PORT', type=str)
-    parser.add_argument('--start_rank', type=int)
+    parser.add_argument('--world_size', type=int, default=1)
+    parser.add_argument('--gpus', type=int, default=1)
+    parser.add_argument('--MASTER_ADDR', type=str, default='localhost')
+    parser.add_argument('--MASTER_PORT', type=str, default='29505')
+    parser.add_argument('--start_rank', type=int, default=0)
 
     return parser.parse_args()
 
@@ -124,10 +136,15 @@ def main(local_rank, args):
        p.requires_grad = False
 
     torch.manual_seed(19940117)
-    torch.cuda.manual_seed_all(19940117)
     random.seed(19940117)
-    torch.cuda.set_device(local_rank)
-    device = torch.device('cuda', local_rank)
+    use_gpu = True
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(19940117)
+        torch.cuda.set_device(local_rank)
+        device = torch.device('cuda', local_rank)
+    else:
+        device = torch.device('cpu')
+        use_gpu = False
 
     model = Parser(vocabs,
                    args.word_char_dim, args.word_dim, args.pos_dim, args.ner_dim,
@@ -143,7 +160,8 @@
         torch.cuda.manual_seed_all(19940117 + dist.get_rank())
         random.seed(19940117+dist.get_rank())
 
-    model = model.cuda(local_rank)
+    if use_gpu:
+        model = model.cuda(local_rank)
     dev_data = DataLoader(vocabs, lexical_mapping, args.dev_data, args.dev_batch_size, for_train=False)
     pp = PostProcessor(vocabs['rel'])
 
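With defaults baked in for the vocab paths, model dimensions, and training schedule, the trainer can now be launched with only the data arguments; the commented-out block added to parse_config suggests an invocation along these lines (paths illustrative):

    cd parser
    python3 train.py --train_data ../data/amr.extended.mrp \
                     --dev_data ../data/amr.extended.mrp \
                     --with_bert --bert_path ../bert-base-cased

And with the new torch.cuda.is_available() check, the same command degrades gracefully on a CPU-only machine: use_gpu is set to False and the model.cuda(local_rank) call is skipped.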
14 changes: 7 additions & 7 deletions parser/work.py
@@ -1,12 +1,12 @@
 import torch
 
-from parser.data import Vocab, DataLoader, DUM, END, CLS, NIL
-from parser.parser import Parser
-from parser.postprocess import PostProcessor
-from parser.extract import LexicalMap
-from parser.utils import move_to_device
-from parser.bert_utils import BertEncoderTokenizer, BertEncoder
-from parser.match import match
+from data import Vocab, DataLoader, DUM, END, CLS, NIL
+from amr_parser import Parser
+from postprocess import PostProcessor
+from extract import LexicalMap
+from utils import move_to_device
+from bert_utils import BertEncoderTokenizer, BertEncoder
+from match import match
 
 import argparse, os, re
 
2 changes: 1 addition & 1 deletion prepare.sh
@@ -1,5 +1,5 @@
 dataset=$1
-python3 -u -m parser.extract --train_data ${dataset}/train.txt.features.preproc
+python3 -u -m parser.extract --train_data ${dataset}
 mv *_vocab ${dataset}/
 # python3 encoder.py
 # cat ${dataset}/*embed | sort | uniq > ${dataset}/glove.embed.txt
