diff --git a/models/training-tuning-scripts/log-parsing-models/log-parsing-training.ipynb b/models/training-tuning-scripts/log-parsing-models/log-parsing-training.ipynb index e900f265d1..3fee317af1 100644 --- a/models/training-tuning-scripts/log-parsing-models/log-parsing-training.ipynb +++ b/models/training-tuning-scripts/log-parsing-models/log-parsing-training.ipynb @@ -66,16 +66,7 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#logs_df = cudf.read_csv(\"../../datasets/training-data/log-parsing-training-data.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -120,59 +111,56 @@ " \n", " \n", " \n", - " 2410\n", + " 257\n", " <NA>\n", " <NA>\n", - " 188.138.56.91 - - [17/May/2018:06:28:15 +0200]...\n", - " 188.138.56.91\n", + " 158.69.5.181 - - [04/Apr/2018:23:06:49 +0200] ...\n", + " 158.69.5.181\n", + " -\n", " -\n", " -\n", - " http://www.almhuette-raith.at/\n", - " Mozilla/5.0 (X11; U; Linux x86_64; de-DE; rv:1...\n", - " ThumbShotsBot\n", - " Linux\n", + " -\n", + " Other\n", + " Other\n", " <NA>\n", " 1.1\n", - " GET\n", - " /images/stories/slideshow/almhuette_raith_01.jpg\n", - " 88161\n", + " POST\n", + " /administrator/index.php\n", + " 4498\n", " 200\n", - " [17/May/2018:06:28:15 +0200]\n", + " [04/Apr/2018:23:06:49 +0200]\n", " \n", " \n", "\n", "" ], "text/plain": [ - " error_level error_message \\\n", - "2410 \n", - "\n", - " raw remote_host \\\n", - "2410 188.138.56.91 - - [17/May/2018:06:28:15 +0200]... 188.138.56.91 \n", + " error_level error_message \\\n", + "257 \n", "\n", - " remote_logname remote_user request_header_referer \\\n", - "2410 - - http://www.almhuette-raith.at/ \n", + " raw remote_host \\\n", + "257 158.69.5.181 - - [04/Apr/2018:23:06:49 +0200] ... 158.69.5.181 \n", "\n", - " request_header_user_agent \\\n", - "2410 Mozilla/5.0 (X11; U; Linux x86_64; de-DE; rv:1... 
\n", + " remote_logname remote_user request_header_referer \\\n", + "257 - - - \n", "\n", - " request_header_user_agent__browser__family \\\n", - "2410 ThumbShotsBot \n", + " request_header_user_agent request_header_user_agent__browser__family \\\n", + "257 - Other \n", "\n", - " request_header_user_agent__os__family \\\n", - "2410 Linux \n", + " request_header_user_agent__os__family \\\n", + "257 Other \n", "\n", - " request_header_user_agent__os__version_string request_http_ver \\\n", - "2410 1.1 \n", + " request_header_user_agent__os__version_string request_http_ver \\\n", + "257 1.1 \n", "\n", - " request_method request_url \\\n", - "2410 GET /images/stories/slideshow/almhuette_raith_01.jpg \n", + " request_method request_url response_bytes_clf status \\\n", + "257 POST /administrator/index.php 4498 200 \n", "\n", - " response_bytes_clf status time_received \n", - "2410 88161 200 [17/May/2018:06:28:15 +0200] " + " time_received \n", + "257 [04/Apr/2018:23:06:49 +0200] " ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -184,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -203,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -215,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -227,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -259,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -276,6 +264,24 @@ " labels.append(labeler(indx, cols))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Size\n", + "Choose the maximum number of tokens and the overlap(stride) for your model. The tokenizer will split up logger logs and they will go through the model separately. There is a speed-tradeoff with smaller models inferencing faster, but potentially containing errors. You may need to experiment with these parameters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "MAX_SEQ_LEN = 128\n", + "STRIDE = 12" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -286,7 +292,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -307,13 +313,17 @@ " for i, tag in enumerate(tags):\n", " temp_tags.append(tag)\n", " temp_tags.extend('X'* subword_counts[i].item())\n", - " subword_labels.append(temp_tags)\n", + " if len(temp_tags) > MAX_SEQ_LEN:\n", + " split_temp_tags = [temp_tags[i:i+MAX_SEQ_LEN] for i in range(0, len(temp_tags)-STRIDE, MAX_SEQ_LEN-STRIDE)]\n", + " subword_labels.extend(split_temp_tags)\n", + " else:\n", + " subword_labels.append(temp_tags)\n", " return subword_labels" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -339,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -355,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -366,11 +376,11 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "padded_labels = [pad(x[:256], '[PAD]', 256) for x in subword_labels]\n", + "padded_labels = [pad(x[:MAX_SEQ_LEN], '[PAD]', MAX_SEQ_LEN) for x in subword_labels]\n", "int_labels = [[label2id.get(l) for l in lab] for lab in padded_labels]\n", "label_tensor = torch.tensor(int_labels).to('cuda')" ] @@ -385,14 +395,15 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "output = tokenizer(logs_df.raw_preprocess,\n", - " max_length=256,\n", - " truncation=True,\n", - " max_num_rows = len(logs_df.raw_preprocess),\n", + " max_length=MAX_SEQ_LEN,\n", + " stride = STRIDE,\n", + " truncation=False,\n", + " max_num_rows = len(logs_df.raw_preprocess)*3,\n", " add_special_tokens=False,\n", " return_tensors='pt'\n", " )\n", @@ -403,7 +414,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -413,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -426,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -445,14 +456,14 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']\n", + "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']\n", 
"- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", @@ -480,7 +491,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -504,39 +515,37 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Epoch: 0%| | 0/2 [00:00