
Gagansh7171 pos en hi fg #104

Open · wants to merge 16 commits into base: master
28 changes: 14 additions & 14 deletions Code/BertToken.py
@@ -16,7 +16,7 @@
 from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
 
 
-logger = logging.getLogger(__name__)
+#logger = logging.getLogger(__name__)
 
 
 def set_seed(args):
@@ -119,10 +119,10 @@ def train(args, train_dataset, valid_dataset, model, tokenizer, labels):
 scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=t_total // 10, num_training_steps=t_total)
 
 # Training
-logger.info("***** Running training *****")
-logger.info(" Num examples = %d", len(train_dataset))
-logger.info(" Num Epochs = %d", args.num_train_epochs)
-logger.info(" Instantaneous batch size per GPU = %d",
+print("***** Running training *****")
+print(" Num examples = %d", len(train_dataset))
+print(" Num Epochs = %d", args.num_train_epochs)
+print(" Instantaneous batch size per GPU = %d",
 args.train_batch_size)
 
 global_step = 0
@@ -175,9 +175,9 @@ def evaluate(args, model, tokenizer, labels, mode, prefix=""):
 eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate)
 
 # Evaluation
-logger.info("***** Running evaluation %s *****", prefix)
-logger.info(" Num examples = %d", len(eval_dataset))
-logger.info(" Batch size = %d", args.eval_batch_size)
+print("***** Running evaluation %s *****", prefix)
+print(" Num examples = %d", len(eval_dataset))
+print(" Batch size = %d", args.eval_batch_size)
 eval_loss = 0.0
 nb_eval_steps = 0
 preds = []
@@ -229,9 +229,9 @@ def evaluate(args, model, tokenizer, labels, mode, prefix=""):
 "accuracy": accuracy_score(out_label_list, preds_list)
 }
 
-logger.info("***** Eval results %s *****", prefix)
+print("***** Eval results %s *****", prefix)
 for key in sorted(results.keys()):
-logger.info(" %s = %s", key, str(results[key]))
+print(" %s = %s", key, str(results[key]))
 
 return results
 
@@ -274,7 +274,7 @@ def collate(examples):
 
 def load_and_cache_examples(args, tokenizer, labels, mode):
 
-logger.info("Creating features from dataset file at %s", args.data_dir)
+print("Creating features from dataset file at %s", args.data_dir)
 examples = read_examples_from_file(args.data_dir, mode)
 features = convert_examples_to_features(examples, labels, tokenizer, args.max_seq_length)
 
@@ -352,17 +352,17 @@ def main():
 
 model.to(args.device)
 
-logger.info("Training/evaluation parameters %s", args)
+print("Training/evaluation parameters %s", args)
 
 train_dataset = load_and_cache_examples(
 args, tokenizer, labels, mode="train")
 valid_dataset = load_and_cache_examples(
 args, tokenizer, labels, mode="validation")
 global_step, tr_loss = train(
 args, train_dataset, valid_dataset, model, tokenizer, labels)
-logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+print(" global_step = %s, average loss = %s", global_step, tr_loss)
 
-logger.info("Saving model checkpoint to %s", args.output_dir)
+print("Saving model checkpoint to %s", args.output_dir)
 # Save a trained model, configuration and tokenizer using `save_pretrained()`.
 # They can then be reloaded using `from_pretrained()`
 
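Note on the logger-to-print swap above: logging.Logger.info applies %-style interpolation to its trailing arguments, but print() does not, so the new calls emit the raw template followed by the argument. A minimal sketch of the difference (the example count is hypothetical):

    # print() space-separates its arguments; no %-interpolation happens.
    print(" Num examples = %d", 120)          # prints:  Num examples = %d 120

    # An f-string reproduces the interpolated output logger.info used to give:
    num_examples = 120  # hypothetical value for illustration
    print(f" Num examples = {num_examples}")  # prints:  Num examples = 120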
8 changes: 4 additions & 4 deletions Data/Preprocess_Scripts/preprocess_pos_en_hi_ud.py
@@ -35,9 +35,9 @@ def scrape_tweets(original_path):
 outfile.write(i)
 
 #scraping tweets
-call(shlex.split('python crawl_tweets_copy.py -i tweet_ids_train.txt -a train-annot.json -o tweets_train.conll'))
-call(shlex.split('python crawl_tweets_copy.py -i tweet_ids_dev.txt -a dev-annot.json -o tweets_dev.conll'))
-call(shlex.split('python crawl_tweets_copy.py -i tweet_ids_test.txt -a test-annot.json -o tweets_test.conll'))
+call(shlex.split('python3 crawl_tweets_copy.py -i tweet_ids_train.txt -a train-annot.json -o tweets_train.conll'))
+call(shlex.split('python3 crawl_tweets_copy.py -i tweet_ids_dev.txt -a dev-annot.json -o tweets_dev.conll'))
+call(shlex.split('python3 crawl_tweets_copy.py -i tweet_ids_test.txt -a test-annot.json -o tweets_test.conll'))
 
 def make_files(original_path,new_path):
 
@@ -176,4 +176,4 @@ def main():
 open(new_path+'Devanagari/all.txt', 'a').writelines([l for l in open(new_path+'Devanagari/validation.txt').readlines() ])
 
 if __name__=="__main__":
-main()
\ No newline at end of file
+main()
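The call() lines above hard-code the python3 launcher, which assumes a python3 binary is on PATH. A common alternative, shown here only as a sketch and not as what this PR does, is sys.executable, which reuses the interpreter already running the preprocessing script:

    import sys
    from subprocess import call

    # sys.executable is the absolute path of the current interpreter, so the
    # child process runs under the same Python as this script.
    call([sys.executable, 'crawl_tweets_copy.py',
          '-i', 'tweet_ids_train.txt',
          '-a', 'train-annot.json',
          '-o', 'tweets_train.conll'])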
12 changes: 6 additions & 6 deletions Data/Preprocess_Scripts/preprocess_qa.sh
@@ -9,17 +9,17 @@ PART1=`dirname "$INP_FILE"`
 PART2=`basename "$INP_FILE"`
 
 #preprocesss for DrQA
-python $PREPROCESS_DIR/preprocess_drqa.py --data_dir $ORIGINAL_DATA_DIR
+python3 $PREPROCESS_DIR/preprocess_drqa.py --data_dir $ORIGINAL_DATA_DIR
 
 #run DrQA
 git clone https://github.com/facebookresearch/DrQA.git
 cd DrQA
 git checkout 96f343c
 pip install elasticsearch==7.8.0 nltk==3.5 scipy==1.5.0 prettytable==0.7.2 tqdm==4.46.1 regex==2020.6.8 termcolor==1.1.0 scikit-learn==0.23.1 numpy==1.18.5 torch==1.4.0
-python setup.py develop
+python3 setup.py develop
 pip install spacy==2.3.0
-python -m spacy download xx_ent_wiki_sm
-python -c "import nltk;nltk.download(['punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'])"
+python3 -m spacy download xx_ent_wiki_sm
+python3 -c "import nltk;nltk.download(['punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'])"
 ./download.sh
 sed -i 's/np.load(filename)/np.load(filename, allow_pickle=True)/g' drqa/retriever/utils.py
 sed -i 's/\[\x27tokenizer_class\x27\], {},/\[\x27tokenizer_class\x27\], {\x27model\x27: \x27xx_ent_wiki_sm\x27},/g' scripts/distant/generate.py
@@ -30,8 +30,8 @@ patch scripts/distant/generate.py <<EOF
 263a264
 > random.seed(0)
 EOF
-python scripts/distant/generate.py $PART1 $PART2 $PREPROCESS_DIR --tokenizer spacy --dev-split 0.2 --n-docs 1 --workers 1
+python3 scripts/distant/generate.py $PART1 $PART2 $PREPROCESS_DIR --tokenizer spacy --dev-split 0.2 --n-docs 1 --workers 1
 
 cd ./..
 # Squad format processor
-python $PREPROCESS_DIR/preprocess_qa_en_hi.py --output_dir $PROCESSED_DIR
+python3 $PREPROCESS_DIR/preprocess_qa_en_hi.py --output_dir $PROCESSED_DIR
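For context on the two sed edits above: the \x27 sequences are hex escapes for single quotes (so the sed program can itself be wrapped in single quotes), and the np.load edit works around NumPy 1.16.3+ defaulting to allow_pickle=False, which makes loading DrQA's pickled object arrays raise a ValueError under the pinned numpy==1.18.5. A small sketch of that failure mode and the patched call (the file name is illustrative):

    import numpy as np

    arr = np.array([{'answer': 42}], dtype=object)   # toy pickled object array
    np.save('demo.npy', arr, allow_pickle=True)

    # np.load('demo.npy')  # raises ValueError: Object arrays cannot be loaded
    #                      # when allow_pickle=False
    loaded = np.load('demo.npy', allow_pickle=True)  # what the sed patch enables
    print(loaded[0]['answer'])                       # -> 42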
2 changes: 1 addition & 1 deletion Data/Preprocess_Scripts/preprocess_sent_en_es.py
@@ -215,4 +215,4 @@ def main():
 os.unlink('sentiment_annotated.txt')
 
 if __name__=='__main__':
-main()
\ No newline at end of file
+main()