Skip to content

Commit

Permalink
Merge branch 'develop' of https://github.com/KosmonautX/lmaozhedong i…
Browse files Browse the repository at this point in the history
…nto develop
  • Loading branch information
kraftedcheese committed Nov 23, 2021
2 parents 915567f + d6f4b34 commit 0fd7778
Show file tree
Hide file tree
Showing 2 changed files with 850 additions and 111 deletions.
180 changes: 69 additions & 111 deletions BERT-NER-dev/roll_call.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,129 +2,87 @@
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/princeton/.local/lib/python3.9/site-packages/torch/cuda/__init__.py:80: UserWarning: CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:112.)\n",
" return torch._C._cuda_getDeviceCount() > 0\n"
]
}
],
"source": [
"from bert import Ner\n",
"import pandas as pd\n",
"\n",
"# Source table for the notebook; later cells read its text columns.\n",
"df = pd.read_csv(\"./z_processed-data.csv\")\n",
"# Tagger built from \"l/out_base/\" (presumably the saved model directory -- confirm).\n",
"model = Ner(\"l/out_base/\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from nltk import ToktokTokenizer\n",
"from nltk.tokenize import sent_tokenize\n",
"\n",
"def sentenced(x, max_chars=512):\n",
"    \"\"\"Collect the entity predictions for every usable sentence in `x`.\n",
"\n",
"    The text is split with `sent_tokenize`; each sentence shorter than\n",
"    `max_chars` characters is passed to `model.predict`, and every returned\n",
"    item whose 'tag' is not 'O' is kept, in order of appearance.\n",
"\n",
"    Sentences of `max_chars` characters or more are skipped entirely.\n",
"    NOTE(review): the limit counts characters, not model tokens -- confirm\n",
"    it matches what `model.predict` can accept.\n",
"\n",
"    Relies on the notebook-level `model` created in an earlier cell.\n",
"    \"\"\"\n",
"    entities = []\n",
"    for sentence in sent_tokenize(x):\n",
"        if len(sentence) >= max_chars:\n",
"            continue\n",
"        entities.extend(\n",
"            atom for atom in model.predict(sentence) if atom['tag'] != 'O'\n",
"        )\n",
"    return entities\n",
"\n",
"# Missing documents are read back as NaN; give them an empty list instead of\n",
"# running the model on the literal text \"nan\" that str(NaN) produces.\n",
"df[\"entities\"] = df[\"3\"].apply(\n",
"    lambda x: sentenced(str(x)) if pd.notna(x) else []\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Save the frame, including its `entities` column, for later use.\n",
"# NOTE(review): to_csv also writes the index by default, which shows up as an\n",
"# extra unnamed column on re-read -- pass index=False if that is not wanted.\n",
"df.to_csv('./y_processed_dat.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>secondary_data</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Adzhubei__Aleksei</td>\n",
" <td>March_12__1962</td>\n",
" <td>Alexei_Adzhubei-s_Account_of_His_Visit_to_Wash...</td>\n",
" <td>TOP SECRETDuring my visit to Washington, Brazi...</td>\n",
" <td>TOP SECRETDuring my visit to Washington Brazil...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Albania__Ministry_of_Foreign_Affairs</td>\n",
" <td>March_15__1971</td>\n",
" <td>Notes_on_a_Bulletin_of_the_Korean_News_Agency_0</td>\n",
" <td>[Handwritten document] Note Looking at the bul...</td>\n",
" <td>Handwritten document Note Looking at the bul...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>Alexandru__Boaba</td>\n",
" <td>April_06__1978</td>\n",
" <td>TELEGRAM_075_205_from_the_Romanian_Embassy_in_...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>Anda__Torleiv__1921-</td>\n",
" <td>October_21__1976</td>\n",
" <td>Telegram_from_the_Embassy_in_Beijing__-Smuggli...</td>\n",
" <td>ROYAL MINISTRY OF FOREIGN AFFAIRSCOPY NO:1: MI...</td>\n",
" <td>ROYAL MINISTRY OF FOREIGN AFFAIRSCOPY NO:1: MI...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>Anda__Torleiv__1921-</td>\n",
" <td>October_22__1976</td>\n",
" <td>Telegram_from_Norwegian_Ambassador_to_China_to...</td>\n",
" <td>22.10.76, 09.34 amCOPY NO:1: MINISTER OF FOREI...</td>\n",
" <td>221076 0934 amCOPY NO:1: MINISTER OF FOREIGN A...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 0 1 \\\n",
"0 0 Adzhubei__Aleksei March_12__1962 \n",
"1 1 Albania__Ministry_of_Foreign_Affairs March_15__1971 \n",
"2 2 Alexandru__Boaba April_06__1978 \n",
"3 3 Anda__Torleiv__1921- October_21__1976 \n",
"4 4 Anda__Torleiv__1921- October_22__1976 \n",
"\n",
" 2 \\\n",
"0 Alexei_Adzhubei-s_Account_of_His_Visit_to_Wash... \n",
"1 Notes_on_a_Bulletin_of_the_Korean_News_Agency_0 \n",
"2 TELEGRAM_075_205_from_the_Romanian_Embassy_in_... \n",
"3 Telegram_from_the_Embassy_in_Beijing__-Smuggli... \n",
"4 Telegram_from_Norwegian_Ambassador_to_China_to... \n",
"\n",
" 3 \\\n",
"0 TOP SECRETDuring my visit to Washington, Brazi... \n",
"1 [Handwritten document] Note Looking at the bul... \n",
"2 NaN \n",
"3 ROYAL MINISTRY OF FOREIGN AFFAIRSCOPY NO:1: MI... \n",
"4 22.10.76, 09.34 amCOPY NO:1: MINISTER OF FOREI... \n",
"\n",
" secondary_data \n",
"0 TOP SECRETDuring my visit to Washington Brazil... \n",
"1 Handwritten document Note Looking at the bul... \n",
"2 NaN \n",
"3 ROYAL MINISTRY OF FOREIGN AFFAIRSCOPY NO:1: MI... \n",
"4 221076 0934 amCOPY NO:1: MINISTER OF FOREIGN A... "
"[{'word': 'Hello', 'tag': 'O', 'confidence': 0.99978107213974},\n",
" {'word': 'from', 'tag': 'O', 'confidence': 0.9994962215423584},\n",
" {'word': 'Obama', 'tag': 'B-PER', 'confidence': 0.949417769908905},\n",
" {'word': 'you', 'tag': 'O', 'confidence': 0.9999090433120728},\n",
" {'word': 'English', 'tag': 'B-MISC', 'confidence': 0.9995313882827759}]"
]
},
"execution_count": 3,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from bert import Ner\n",
"import pandas as pd\n",
"df = pd.read_csv(\"./z_processed-data.csv\")\n",
"#model = Ner(\"l/out_base/\")\n",
"# `apply` needs a callable: the earlier form passed the *result* of calling\n",
"# `model.predict()` with no text. Rows that are not strings (e.g. NaN for a\n",
"# missing document) get an empty list instead of being sent to the model.\n",
"# Uses the `model` already loaded by the first cell, since the line above is disabled.\n",
"df[\"entities\"] = df[\"secondary_data\"].apply(\n",
"    lambda text: model.predict(text) if isinstance(text, str) else []\n",
")\n",
"org_chart = pd.read_csv(\"./merged_political_military.csv\")\n",
"org_chart.head()\n",
"# output"
]
}
],
Expand Down
781 changes: 781 additions & 0 deletions BERT-NER-dev/y_processed_dat.csv

Large diffs are not rendered by default.

0 comments on commit 0fd7778

Please sign in to comment.