diff --git a/models/training-tuning-scripts/log-parsing-models/log-parsing-training.ipynb b/models/training-tuning-scripts/log-parsing-models/log-parsing-training.ipynb
index e900f265d1..3fee317af1 100644
--- a/models/training-tuning-scripts/log-parsing-models/log-parsing-training.ipynb
+++ b/models/training-tuning-scripts/log-parsing-models/log-parsing-training.ipynb
@@ -66,16 +66,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#logs_df = cudf.read_csv(\"../../datasets/training-data/log-parsing-training-data.csv\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -120,59 +111,56 @@
" \n",
"
\n",
" \n",
- " 2410 | \n",
+ " 257 | \n",
" <NA> | \n",
" <NA> | \n",
- " 188.138.56.91 - - [17/May/2018:06:28:15 +0200]... | \n",
- " 188.138.56.91 | \n",
+ " 158.69.5.181 - - [04/Apr/2018:23:06:49 +0200] ... | \n",
+ " 158.69.5.181 | \n",
+ " - | \n",
" - | \n",
" - | \n",
- " http://www.almhuette-raith.at/ | \n",
- " Mozilla/5.0 (X11; U; Linux x86_64; de-DE; rv:1... | \n",
- " ThumbShotsBot | \n",
- " Linux | \n",
+ " - | \n",
+ " Other | \n",
+ " Other | \n",
" <NA> | \n",
" 1.1 | \n",
- " GET | \n",
- " /images/stories/slideshow/almhuette_raith_01.jpg | \n",
- " 88161 | \n",
+ " POST | \n",
+ " /administrator/index.php | \n",
+ " 4498 | \n",
" 200 | \n",
- " [17/May/2018:06:28:15 +0200] | \n",
+ " [04/Apr/2018:23:06:49 +0200] | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " error_level error_message \\\n",
- "2410 \n",
- "\n",
- " raw remote_host \\\n",
- "2410 188.138.56.91 - - [17/May/2018:06:28:15 +0200]... 188.138.56.91 \n",
+ " error_level error_message \\\n",
+ "257 \n",
"\n",
- " remote_logname remote_user request_header_referer \\\n",
- "2410 - - http://www.almhuette-raith.at/ \n",
+ " raw remote_host \\\n",
+ "257 158.69.5.181 - - [04/Apr/2018:23:06:49 +0200] ... 158.69.5.181 \n",
"\n",
- " request_header_user_agent \\\n",
- "2410 Mozilla/5.0 (X11; U; Linux x86_64; de-DE; rv:1... \n",
+ " remote_logname remote_user request_header_referer \\\n",
+ "257 - - - \n",
"\n",
- " request_header_user_agent__browser__family \\\n",
- "2410 ThumbShotsBot \n",
+ " request_header_user_agent request_header_user_agent__browser__family \\\n",
+ "257 - Other \n",
"\n",
- " request_header_user_agent__os__family \\\n",
- "2410 Linux \n",
+ " request_header_user_agent__os__family \\\n",
+ "257 Other \n",
"\n",
- " request_header_user_agent__os__version_string request_http_ver \\\n",
- "2410 1.1 \n",
+ " request_header_user_agent__os__version_string request_http_ver \\\n",
+ "257 1.1 \n",
"\n",
- " request_method request_url \\\n",
- "2410 GET /images/stories/slideshow/almhuette_raith_01.jpg \n",
+ " request_method request_url response_bytes_clf status \\\n",
+ "257 POST /administrator/index.php 4498 200 \n",
"\n",
- " response_bytes_clf status time_received \n",
- "2410 88161 200 [17/May/2018:06:28:15 +0200] "
+ " time_received \n",
+ "257 [04/Apr/2018:23:06:49 +0200] "
]
},
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -184,7 +172,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -203,7 +191,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -215,7 +203,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -227,7 +215,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -259,7 +247,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -276,6 +264,24 @@
" labels.append(labeler(indx, cols))"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Model Size\n",
+ "Choose the maximum number of tokens and the overlap(stride) for your model. The tokenizer will split up logger logs and they will go through the model separately. There is a speed-tradeoff with smaller models inferencing faster, but potentially containing errors. You may need to experiment with these parameters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "MAX_SEQ_LEN = 128\n",
+ "STRIDE = 12"
+ ]
+ },
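+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "With `MAX_SEQ_LEN=128` and `STRIDE=12`, successive windows start `MAX_SEQ_LEN - STRIDE = 116` tokens apart, so neighboring windows share 12 tokens. The cell below is a minimal illustrative sketch of that windowing; `demo_tokens` is a hypothetical stand-in for the token IDs of one long log."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative sketch only: window a hypothetical 300-token log the same way\n",
+ "# the tokenizer (and the label-splitting helper below) splits long sequences.\n",
+ "demo_tokens = list(range(300))\n",
+ "windows = [demo_tokens[i:i+MAX_SEQ_LEN] for i in range(0, len(demo_tokens)-STRIDE, MAX_SEQ_LEN-STRIDE)]\n",
+ "[(w[0], w[-1], len(w)) for w in windows]  # (first token, last token, window length)"
+ ]
+ },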
{
"cell_type": "markdown",
"metadata": {},
@@ -286,7 +292,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -307,13 +313,17 @@
" for i, tag in enumerate(tags):\n",
" temp_tags.append(tag)\n",
" temp_tags.extend('X'* subword_counts[i].item())\n",
- " subword_labels.append(temp_tags)\n",
+ " if len(temp_tags) > MAX_SEQ_LEN:\n",
+ " split_temp_tags = [temp_tags[i:i+MAX_SEQ_LEN] for i in range(0, len(temp_tags)-STRIDE, MAX_SEQ_LEN-STRIDE)]\n",
+ " subword_labels.extend(split_temp_tags)\n",
+ " else:\n",
+ " subword_labels.append(temp_tags)\n",
" return subword_labels"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -339,7 +349,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -355,7 +365,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -366,11 +376,11 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
- "padded_labels = [pad(x[:256], '[PAD]', 256) for x in subword_labels]\n",
+ "padded_labels = [pad(x[:MAX_SEQ_LEN], '[PAD]', MAX_SEQ_LEN) for x in subword_labels]\n",
"int_labels = [[label2id.get(l) for l in lab] for lab in padded_labels]\n",
"label_tensor = torch.tensor(int_labels).to('cuda')"
]
@@ -385,14 +395,15 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"output = tokenizer(logs_df.raw_preprocess,\n",
- " max_length=256,\n",
- " truncation=True,\n",
- " max_num_rows = len(logs_df.raw_preprocess),\n",
+ " max_length=MAX_SEQ_LEN,\n",
+ " stride = STRIDE,\n",
+ " truncation=False,\n",
+ " max_num_rows = len(logs_df.raw_preprocess)*3,\n",
" add_special_tokens=False,\n",
" return_tensors='pt'\n",
" )\n",
@@ -403,7 +414,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
@@ -413,7 +424,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -426,7 +437,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -445,14 +456,14 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']\n",
+ "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
"- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
@@ -480,7 +491,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
@@ -504,39 +515,37 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "Epoch: 0%| | 0/2 [00:00, ?it/s]/opt/conda/envs/rapids/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:64: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
- " warnings.warn('Was asked to gather along dimension 0, but all '\n",
- "Epoch: 50%|█████ | 1/2 [00:34<00:34, 34.13s/it]"
+ "Epoch: 50%|█████ | 1/2 [00:35<00:35, 35.41s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Train loss: 0.8918188477100264\n"
+ "Train loss: 0.18636336472931586\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "Epoch: 100%|██████████| 2/2 [00:58<00:00, 29.18s/it]"
+ "Epoch: 100%|██████████| 2/2 [01:10<00:00, 35.27s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Train loss: 0.027731418560799814\n",
- "CPU times: user 1min 31s, sys: 33.8 s, total: 2min 5s\n",
- "Wall time: 58.4 s\n"
+ "Train loss: 0.0059268270875965185\n",
+ "CPU times: user 44.8 s, sys: 25.7 s, total: 1min 10s\n",
+ "Wall time: 1min 10s\n"
]
},
{
@@ -587,7 +596,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@@ -597,33 +606,33 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "f1 score: 1.000000\n",
- "Accuracy score: 1.000000\n",
+ "f1 score: 0.998655\n",
+ "Accuracy score: 0.999771\n",
" precision recall f1-score support\n",
"\n",
- " error_level 1.000 1.000 1.000 87\n",
- " error_message 1.000 1.000 1.000 87\n",
- " remote_host 1.000 1.000 1.000 905\n",
- " request_header_referer 1.000 1.000 1.000 498\n",
- " request_header_user_agent 1.000 1.000 1.000 851\n",
- "request_header_user_agent__os__version_string 1.000 1.000 1.000 18\n",
- " request_http_ver 1.000 1.000 1.000 905\n",
- " request_method 1.000 1.000 1.000 905\n",
- " request_url 1.000 1.000 1.000 905\n",
- " response_bytes_clf 1.000 1.000 1.000 904\n",
- " status 1.000 1.000 1.000 905\n",
- " time_received 1.000 1.000 1.000 964\n",
+ " error_level 1.000 1.000 1.000 100\n",
+ " error_message 1.000 1.000 1.000 100\n",
+ " remote_host 1.000 1.000 1.000 913\n",
+ " request_header_referer 1.000 1.000 1.000 508\n",
+ " request_header_user_agent 1.000 1.000 1.000 1002\n",
+ "request_header_user_agent__os__version_string 0.875 1.000 0.933 14\n",
+ " request_http_ver 1.000 1.000 1.000 913\n",
+ " request_method 1.000 1.000 1.000 913\n",
+ " request_url 0.997 0.981 0.989 913\n",
+ " response_bytes_clf 1.000 1.000 1.000 911\n",
+ " status 1.000 1.000 1.000 912\n",
+ " time_received 1.000 1.000 1.000 985\n",
"\n",
- " micro avg 1.000 1.000 1.000 7934\n",
- " macro avg 1.000 1.000 1.000 7934\n",
- " weighted avg 1.000 1.000 1.000 7934\n",
+ " micro avg 0.999 0.998 0.999 8184\n",
+ " macro avg 0.989 0.998 0.994 8184\n",
+ " weighted avg 0.999 0.998 0.999 8184\n",
"\n"
]
}