From 918f6a23f31fd448724013b7d39bbb50034f8f06 Mon Sep 17 00:00:00 2001 From: Marius Ciepluch <11855163+norandom@users.noreply.github.com> Date: Fri, 2 Aug 2024 14:10:20 +0200 Subject: [PATCH] Docu update, further cuml tests --- ...AE_sysmon_dataset_(Excel_implant_C2).ipynb | 3756 +++++++---------- 1 file changed, 1493 insertions(+), 2263 deletions(-) diff --git a/LinFormer_AutoML_on_AE_sysmon_dataset_(Excel_implant_C2).ipynb b/LinFormer_AutoML_on_AE_sysmon_dataset_(Excel_implant_C2).ipynb index 7179b49..ad3a955 100644 --- a/LinFormer_AutoML_on_AE_sysmon_dataset_(Excel_implant_C2).ipynb +++ b/LinFormer_AutoML_on_AE_sysmon_dataset_(Excel_implant_C2).ipynb @@ -5,7 +5,8 @@ "colab": { "provenance": [], "machine_shape": "hm", - "gpuType": "T4" + "gpuType": "A100", + "toc_visible": true }, "kernelspec": { "name": "python3", @@ -701,7 +702,7 @@ "description_width": "" } }, - "2cb39463ffff4f9f8c6c6ece6f0e2769": { + "873451c433c1441796e0f2c582b54edc": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", @@ -716,14 +717,14 @@ "_view_name": "HBoxView", "box_style": "", "children": [ - "IPY_MODEL_c51ec13acbd346de98bf66fd543c8ddd", - "IPY_MODEL_b325e19f0a7c4baebd5660a78e45131a", - "IPY_MODEL_3878d666a20c45d995dfc79b02c2c165" + "IPY_MODEL_12bd969e54e64b17b22957ca3da533fa", + "IPY_MODEL_4dd1615acf4645d7861226a3da60ae27", + "IPY_MODEL_b5c015fcd9bd4461885e5e5c7f49fa9a" ], - "layout": "IPY_MODEL_e2aee7171ace4dc0bb8cd09220db5d12" + "layout": "IPY_MODEL_c46835dbd92c49d6aef630a6d74f8588" } }, - "c51ec13acbd346de98bf66fd543c8ddd": { + "12bd969e54e64b17b22957ca3da533fa": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", @@ -738,13 +739,13 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_48622890a0254e49b2d26af09626560f", + "layout": "IPY_MODEL_8bf87a1e0740469cbea6f0c76dd37735", "placeholder": "​", - "style": 
"IPY_MODEL_bcc8ab2a76e54b20a36e8b953a78f0e4", - "value": "100%" + "style": "IPY_MODEL_cc4bbcf9f0c44cd69117dc6251954049", + "value": "Optimization Progress: 100%" } }, - "b325e19f0a7c4baebd5660a78e45131a": { + "4dd1615acf4645d7861226a3da60ae27": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", @@ -757,18 +758,18 @@ "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", - "bar_style": "success", + "bar_style": "", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_4dcbd8944dcc4ea1b6992ad7740da36e", - "max": 8039037, + "layout": "IPY_MODEL_b988186cafdf4da2a30bcff8a6a00f70", + "max": 20, "min": 0, "orientation": "horizontal", - "style": "IPY_MODEL_961a05a5d5b446699f82905129b7cd19", - "value": 8039037 + "style": "IPY_MODEL_f998e026eba0488288515e8b69eacc96", + "value": 20 } }, - "3878d666a20c45d995dfc79b02c2c165": { + "b5c015fcd9bd4461885e5e5c7f49fa9a": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", @@ -783,13 +784,13 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_8454cae7dcf54932a356ad972733775f", + "layout": "IPY_MODEL_0df60724539b4708a9323dbb479fa044", "placeholder": "​", - "style": "IPY_MODEL_5a0d14012c40493081d1cca56b5ffde2", - "value": " 8.04M/8.04M [00:00<00:00, 70.2MiB/s]" + "style": "IPY_MODEL_dad976092fff4859aaf73eb9d85aa01b", + "value": " 120/120 [07:16<00:00,  3.02s/pipeline]" } }, - "e2aee7171ace4dc0bb8cd09220db5d12": { + "c46835dbd92c49d6aef630a6d74f8588": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -837,11 +838,11 @@ "padding": null, "right": null, "top": null, - "visibility": null, + "visibility": "hidden", "width": null } }, - "48622890a0254e49b2d26af09626560f": { + "8bf87a1e0740469cbea6f0c76dd37735": { "model_module": "@jupyter-widgets/base", 
"model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -893,7 +894,7 @@ "width": null } }, - "bcc8ab2a76e54b20a36e8b953a78f0e4": { + "cc4bbcf9f0c44cd69117dc6251954049": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", @@ -908,7 +909,7 @@ "description_width": "" } }, - "4dcbd8944dcc4ea1b6992ad7740da36e": { + "b988186cafdf4da2a30bcff8a6a00f70": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -960,7 +961,7 @@ "width": null } }, - "961a05a5d5b446699f82905129b7cd19": { + "f998e026eba0488288515e8b69eacc96": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", @@ -976,7 +977,7 @@ "description_width": "" } }, - "8454cae7dcf54932a356ad972733775f": { + "0df60724539b4708a9323dbb479fa044": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -1028,7 +1029,7 @@ "width": null } }, - "5a0d14012c40493081d1cca56b5ffde2": { + "dad976092fff4859aaf73eb9d85aa01b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", @@ -1043,7 +1044,7 @@ "description_width": "" } }, - "6043ef5c70a3493d8f45d8231726d2a6": { + "4b8ebee65b2e4789bce617bedb691cb7": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", @@ -1058,14 +1059,14 @@ "_view_name": "HBoxView", "box_style": "", "children": [ - "IPY_MODEL_317e803897b64bc58bb4b7d9d0ef2504", - "IPY_MODEL_97270bdece1442719201203cfe72f262", - "IPY_MODEL_ef2ae2c4892247919876aa97f42a3561" + "IPY_MODEL_a2a4e23c74fb4892be3f9edd122efd35", + "IPY_MODEL_1abae52fd84841759c034f9fb78c65a3", + "IPY_MODEL_36359ad021e64e6eb733ae740d987e68" ], - "layout": "IPY_MODEL_e5482a1a80244f039aeadfd8fe87534f" + "layout": "IPY_MODEL_80d36d8a650f42d8a990e04014e937a1" } }, - "317e803897b64bc58bb4b7d9d0ef2504": { + 
"a2a4e23c74fb4892be3f9edd122efd35": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", @@ -1080,13 +1081,13 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_1c213845078846779f14d6143da25162", + "layout": "IPY_MODEL_751c1855804a48ff9f0d6496a5e29506", "placeholder": "​", - "style": "IPY_MODEL_805d5e39429847c893d38d02524b6a59", - "value": "100%" + "style": "IPY_MODEL_4749064df71f48f4b947ba208acbac26", + "value": "Optimization Progress: 100%" } }, - "97270bdece1442719201203cfe72f262": { + "1abae52fd84841759c034f9fb78c65a3": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", @@ -1099,18 +1100,18 @@ "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", - "bar_style": "success", + "bar_style": "", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_09e0bcda1a3645268fc0ec1b7540538f", - "max": 1716416, + "layout": "IPY_MODEL_14db8797959a4253a36dd334f4826a00", + "max": 20, "min": 0, "orientation": "horizontal", - "style": "IPY_MODEL_07ffb70db96a431cba11b328e65b312d", - "value": 1716416 + "style": "IPY_MODEL_94e3a14430f64fa9ab318f0e6c37a883", + "value": 20 } }, - "ef2ae2c4892247919876aa97f42a3561": { + "36359ad021e64e6eb733ae740d987e68": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", @@ -1125,13 +1126,13 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_ce339a8ee46c435493635d30526a8f35", + "layout": "IPY_MODEL_9b974c27ffe7444ab7f9683167a6988c", "placeholder": "​", - "style": "IPY_MODEL_2b4516e19f694d9c98a038ac222735be", - "value": " 1.72M/1.72M [00:00<00:00, 29.5MiB/s]" + "style": "IPY_MODEL_e13925b2a40c4dcf993c62f64d332272", + "value": " 120/120 [08:06<00:00,  4.12s/pipeline]" } }, - "e5482a1a80244f039aeadfd8fe87534f": { + 
"80d36d8a650f42d8a990e04014e937a1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -1179,11 +1180,11 @@ "padding": null, "right": null, "top": null, - "visibility": null, + "visibility": "hidden", "width": null } }, - "1c213845078846779f14d6143da25162": { + "751c1855804a48ff9f0d6496a5e29506": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -1235,7 +1236,7 @@ "width": null } }, - "805d5e39429847c893d38d02524b6a59": { + "4749064df71f48f4b947ba208acbac26": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", @@ -1250,7 +1251,7 @@ "description_width": "" } }, - "09e0bcda1a3645268fc0ec1b7540538f": { + "14db8797959a4253a36dd334f4826a00": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -1302,7 +1303,7 @@ "width": null } }, - "07ffb70db96a431cba11b328e65b312d": { + "94e3a14430f64fa9ab318f0e6c37a883": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", @@ -1318,7 +1319,7 @@ "description_width": "" } }, - "ce339a8ee46c435493635d30526a8f35": { + "9b974c27ffe7444ab7f9683167a6988c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -1370,7 +1371,7 @@ "width": null } }, - "2b4516e19f694d9c98a038ac222735be": { + "e13925b2a40c4dcf993c62f64d332272": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", @@ -1385,7 +1386,7 @@ "description_width": "" } }, - "e596dc64bc2145feb8039da7c626250d": { + "d143776d4b464a648b17ae8e280ecc39": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", @@ -1400,14 +1401,14 @@ "_view_name": "HBoxView", "box_style": "", "children": [ - 
"IPY_MODEL_4f8021ec3c87454e856aa23426dd92f7", - "IPY_MODEL_abaf96bfaed843918b225cf759ab5f5d", - "IPY_MODEL_647e90686f1d49d8b6512e9144287cb7" + "IPY_MODEL_16a6911e48b142cf83401adb3788792f", + "IPY_MODEL_68f7ddcbed594c8c8a511c20be8d4acd", + "IPY_MODEL_4c3ff39abc4b4f0baeee0538ce75d76f" ], - "layout": "IPY_MODEL_5ccef87811f84ba69ff589e4dd0a0db1" + "layout": "IPY_MODEL_1f8d747cea354729af6b532fb2a23a97" } }, - "4f8021ec3c87454e856aa23426dd92f7": { + "16a6911e48b142cf83401adb3788792f": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", @@ -1422,13 +1423,13 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_3811f9b18eb94a998715a71e1d80c902", + "layout": "IPY_MODEL_f634d0b3e4c34f058ab52077ccb97407", "placeholder": "​", - "style": "IPY_MODEL_cd69e99568cc440fb9c70c0e1327c740", + "style": "IPY_MODEL_5df15aaaa49b425e827b57a427479f89", "value": "Optimization Progress: 100%" } }, - "abaf96bfaed843918b225cf759ab5f5d": { + "68f7ddcbed594c8c8a511c20be8d4acd": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", @@ -1444,15 +1445,15 @@ "bar_style": "", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_d6aeb9c1304c478aa713252980655782", + "layout": "IPY_MODEL_11ac4cb2fd7641e98779b027d47f2696", "max": 20, "min": 0, "orientation": "horizontal", - "style": "IPY_MODEL_20a9e1000946406dbe45e5c0b92c88ab", + "style": "IPY_MODEL_d7a0b39588cb421492dde89492430a84", "value": 20 } }, - "647e90686f1d49d8b6512e9144287cb7": { + "4c3ff39abc4b4f0baeee0538ce75d76f": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", @@ -1467,13 +1468,13 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_50d69c11c64942e7a8551479dbf89ba5", + "layout": "IPY_MODEL_fe991ac34e224d55b9c27ca1bad2808b", "placeholder": "​", - "style": 
"IPY_MODEL_e9a9ed09603a4cf38180372257723d1c", - "value": " 100/100 [05:58<00:00,  3.89s/pipeline]" + "style": "IPY_MODEL_ba7425e6aedf46f5adce1a7e8e2a7c4d", + "value": " 120/120 [07:32<00:00,  4.69s/pipeline]" } }, - "5ccef87811f84ba69ff589e4dd0a0db1": { + "1f8d747cea354729af6b532fb2a23a97": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -1525,7 +1526,7 @@ "width": null } }, - "3811f9b18eb94a998715a71e1d80c902": { + "f634d0b3e4c34f058ab52077ccb97407": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -1577,7 +1578,7 @@ "width": null } }, - "cd69e99568cc440fb9c70c0e1327c740": { + "5df15aaaa49b425e827b57a427479f89": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", @@ -1592,7 +1593,7 @@ "description_width": "" } }, - "d6aeb9c1304c478aa713252980655782": { + "11ac4cb2fd7641e98779b027d47f2696": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -1644,7 +1645,7 @@ "width": null } }, - "20a9e1000946406dbe45e5c0b92c88ab": { + "d7a0b39588cb421492dde89492430a84": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", @@ -1660,7 +1661,7 @@ "description_width": "" } }, - "50d69c11c64942e7a8551479dbf89ba5": { + "fe991ac34e224d55b9c27ca1bad2808b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -1712,7 +1713,7 @@ "width": null } }, - "e9a9ed09603a4cf38180372257723d1c": { + "ba7425e6aedf46f5adce1a7e8e2a7c4d": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", @@ -1727,7 +1728,7 @@ "description_width": "" } }, - "33d9616f5fc34716b62b6b76940de17f": { + "2b1a6384b01945e8b5422f8033b35f3e": { "model_module": "@jupyter-widgets/controls", "model_name": 
"HBoxModel", "model_module_version": "1.5.0", @@ -1742,14 +1743,14 @@ "_view_name": "HBoxView", "box_style": "", "children": [ - "IPY_MODEL_99c1326110284c378d90c411f95d2c41", - "IPY_MODEL_7d7228aabd874fe89402185e323a6d56", - "IPY_MODEL_20f007cefbca4a38a9887914f9b8f908" + "IPY_MODEL_ac9c8d71d04244faa9bc62060455bf18", + "IPY_MODEL_1fc73e630cf24f478dfbb3b8f2761383", + "IPY_MODEL_6e5336aa80094ee3826ba89335fbfd77" ], - "layout": "IPY_MODEL_44390d56d7734686b6dab24db04abd1c" + "layout": "IPY_MODEL_e8d44a1dded246ae94cd3413a216be4d" } }, - "99c1326110284c378d90c411f95d2c41": { + "ac9c8d71d04244faa9bc62060455bf18": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", @@ -1764,13 +1765,13 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_4f9eee0acf654464bae4a250bc78aec3", + "layout": "IPY_MODEL_d77493bbfb9d4197bc83de151f5398c7", "placeholder": "​", - "style": "IPY_MODEL_bbc3470e99a74d9385884ce080281aed", - "value": "Optimization Progress: 100%" + "style": "IPY_MODEL_627dcda5ea37455da054403897787bf2", + "value": "100%" } }, - "7d7228aabd874fe89402185e323a6d56": { + "1fc73e630cf24f478dfbb3b8f2761383": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", @@ -1783,18 +1784,18 @@ "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", - "bar_style": "", + "bar_style": "success", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_0a9b4564bbc24e11aed04594e38190fb", - "max": 20, + "layout": "IPY_MODEL_a6d24f5b014f4202805c31486fc656cb", + "max": 8039037, "min": 0, "orientation": "horizontal", - "style": "IPY_MODEL_122fdca0c85449e0a3cad8828bcb67df", - "value": 20 + "style": "IPY_MODEL_8ab240c04e6e4ec5bc14918d47deffdf", + "value": 8039037 } }, - "20f007cefbca4a38a9887914f9b8f908": { + "6e5336aa80094ee3826ba89335fbfd77": { "model_module": 
"@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", @@ -1809,13 +1810,13 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_6ad9c2cfa49c4b0f9843d9afb3ef4bcc", + "layout": "IPY_MODEL_823df5cd8416469a9d826b118f741d6d", "placeholder": "​", - "style": "IPY_MODEL_34a265dfe03b4c31bad5c098d2a53a57", - "value": " 100/100 [04:26<00:00,  1.68s/pipeline]" + "style": "IPY_MODEL_2ac41eef5c6248c6ae773df951bd82d2", + "value": " 8.04M/8.04M [00:00<00:00, 43.4MiB/s]" } }, - "44390d56d7734686b6dab24db04abd1c": { + "e8d44a1dded246ae94cd3413a216be4d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -1863,11 +1864,11 @@ "padding": null, "right": null, "top": null, - "visibility": "hidden", + "visibility": null, "width": null } }, - "4f9eee0acf654464bae4a250bc78aec3": { + "d77493bbfb9d4197bc83de151f5398c7": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -1919,7 +1920,7 @@ "width": null } }, - "bbc3470e99a74d9385884ce080281aed": { + "627dcda5ea37455da054403897787bf2": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", @@ -1934,7 +1935,7 @@ "description_width": "" } }, - "0a9b4564bbc24e11aed04594e38190fb": { + "a6d24f5b014f4202805c31486fc656cb": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -1986,7 +1987,7 @@ "width": null } }, - "122fdca0c85449e0a3cad8828bcb67df": { + "8ab240c04e6e4ec5bc14918d47deffdf": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", @@ -2002,7 +2003,7 @@ "description_width": "" } }, - "6ad9c2cfa49c4b0f9843d9afb3ef4bcc": { + "823df5cd8416469a9d826b118f741d6d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -2054,7 
+2055,7 @@ "width": null } }, - "34a265dfe03b4c31bad5c098d2a53a57": { + "2ac41eef5c6248c6ae773df951bd82d2": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", @@ -2069,7 +2070,7 @@ "description_width": "" } }, - "66157afab32248dc8d040eebee0718d3": { + "b188bad6827047b4aa8d57facea61b3c": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", @@ -2084,14 +2085,14 @@ "_view_name": "HBoxView", "box_style": "", "children": [ - "IPY_MODEL_740ad9c72d404dcea2affe9118a039e8", - "IPY_MODEL_4a9497a6f861409fbe56f64dabcc5aab", - "IPY_MODEL_8c143fc49ac0499a9144d4705ee42fff" + "IPY_MODEL_7b85fe2ab1314a27bf6f2715bcf2c253", + "IPY_MODEL_7c6ae3b2016a4974ad5d80a36fa59c5f", + "IPY_MODEL_72ad35f43fcb414f85a72baea10ceaee" ], - "layout": "IPY_MODEL_883189238c3f4928b35158622649598a" + "layout": "IPY_MODEL_f6087d8ae5584ed8805f20655307c694" } }, - "740ad9c72d404dcea2affe9118a039e8": { + "7b85fe2ab1314a27bf6f2715bcf2c253": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", @@ -2106,13 +2107,13 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_46e1ce81b2e54ee5936120bc892e8ddd", + "layout": "IPY_MODEL_c3f1706f0a6643abbe02debdccc80d16", "placeholder": "​", - "style": "IPY_MODEL_ec201971aa684b97acc1a9ca17f9d180", - "value": "Optimization Progress: 100%" + "style": "IPY_MODEL_b2023b49e067416a8712a878187ae26a", + "value": "100%" } }, - "4a9497a6f861409fbe56f64dabcc5aab": { + "7c6ae3b2016a4974ad5d80a36fa59c5f": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", @@ -2125,18 +2126,18 @@ "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", - "bar_style": "", + "bar_style": "success", "description": "", "description_tooltip": null, - "layout": 
"IPY_MODEL_c7900ee411fb4d719907d466ebd9c96f", - "max": 20, + "layout": "IPY_MODEL_d836a994fe684de792bffea9d47097b0", + "max": 1716416, "min": 0, "orientation": "horizontal", - "style": "IPY_MODEL_71ac2675223046a893072a3ffc361d05", - "value": 20 + "style": "IPY_MODEL_e72b3600beb84517a48e51558e681ecb", + "value": 1716416 } }, - "8c143fc49ac0499a9144d4705ee42fff": { + "72ad35f43fcb414f85a72baea10ceaee": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", @@ -2151,13 +2152,13 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_743e80c2b3bb4269a095f6be3b7292ae", + "layout": "IPY_MODEL_4ea0250740634945be80f5352bee2b28", "placeholder": "​", - "style": "IPY_MODEL_92088f9533bb48ccb63bf9328304b0e3", - "value": " 100/100 [05:33<00:00,  2.67s/pipeline]" + "style": "IPY_MODEL_c1290b54ba5d405fad098180f83b7a5a", + "value": " 1.72M/1.72M [00:00<00:00, 30.2MiB/s]" } }, - "883189238c3f4928b35158622649598a": { + "f6087d8ae5584ed8805f20655307c694": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -2205,11 +2206,11 @@ "padding": null, "right": null, "top": null, - "visibility": "hidden", + "visibility": null, "width": null } }, - "46e1ce81b2e54ee5936120bc892e8ddd": { + "c3f1706f0a6643abbe02debdccc80d16": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -2261,7 +2262,7 @@ "width": null } }, - "ec201971aa684b97acc1a9ca17f9d180": { + "b2023b49e067416a8712a878187ae26a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", @@ -2276,7 +2277,7 @@ "description_width": "" } }, - "c7900ee411fb4d719907d466ebd9c96f": { + "d836a994fe684de792bffea9d47097b0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -2328,7 +2329,7 @@ "width": null } }, - 
"71ac2675223046a893072a3ffc361d05": { + "e72b3600beb84517a48e51558e681ecb": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", @@ -2344,7 +2345,7 @@ "description_width": "" } }, - "743e80c2b3bb4269a095f6be3b7292ae": { + "4ea0250740634945be80f5352bee2b28": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -2396,7 +2397,7 @@ "width": null } }, - "92088f9533bb48ccb63bf9328304b0e3": { + "c1290b54ba5d405fad098180f83b7a5a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", @@ -2411,7 +2412,7 @@ "description_width": "" } }, - "873451c433c1441796e0f2c582b54edc": { + "a720f2592e3d4bef98a4c6f5077f277f": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", @@ -2426,14 +2427,14 @@ "_view_name": "HBoxView", "box_style": "", "children": [ - "IPY_MODEL_12bd969e54e64b17b22957ca3da533fa", - "IPY_MODEL_4dd1615acf4645d7861226a3da60ae27", - "IPY_MODEL_b5c015fcd9bd4461885e5e5c7f49fa9a" + "IPY_MODEL_d2ae2ec449c14266b009f213df117da2", + "IPY_MODEL_979b4554ad0b474b883040d9bbd51274", + "IPY_MODEL_be36aefa1b3e4ea6a776e02c082c9958" ], - "layout": "IPY_MODEL_c46835dbd92c49d6aef630a6d74f8588" + "layout": "IPY_MODEL_7fc5e7ea894245579a1abdeed4492954" } }, - "12bd969e54e64b17b22957ca3da533fa": { + "d2ae2ec449c14266b009f213df117da2": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", @@ -2448,13 +2449,13 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_8bf87a1e0740469cbea6f0c76dd37735", + "layout": "IPY_MODEL_88b18cc3b9cf4eaa8edc5b16aaf28821", "placeholder": "​", - "style": "IPY_MODEL_cc4bbcf9f0c44cd69117dc6251954049", + "style": "IPY_MODEL_d5935c9201ed471ab2385df6e607380c", "value": "Optimization Progress: 100%" } }, - 
"4dd1615acf4645d7861226a3da60ae27": { + "979b4554ad0b474b883040d9bbd51274": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", @@ -2470,15 +2471,15 @@ "bar_style": "", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_b988186cafdf4da2a30bcff8a6a00f70", + "layout": "IPY_MODEL_4aa0b28b2aa44951aad3bfe37fa20051", "max": 20, "min": 0, "orientation": "horizontal", - "style": "IPY_MODEL_f998e026eba0488288515e8b69eacc96", + "style": "IPY_MODEL_43490d899aed4338823374634d05c9f1", "value": 20 } }, - "b5c015fcd9bd4461885e5e5c7f49fa9a": { + "be36aefa1b3e4ea6a776e02c082c9958": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", @@ -2493,13 +2494,13 @@ "_view_name": "HTMLView", "description": "", "description_tooltip": null, - "layout": "IPY_MODEL_0df60724539b4708a9323dbb479fa044", + "layout": "IPY_MODEL_512a81b06bd54811bd8b9f80958fa7a4", "placeholder": "​", - "style": "IPY_MODEL_dad976092fff4859aaf73eb9d85aa01b", - "value": " 120/120 [07:16<00:00,  3.02s/pipeline]" + "style": "IPY_MODEL_201d44e85f374a06929ae8db5851d61b", + "value": " 120/120 [08:06<00:00,  3.01s/pipeline]" } }, - "c46835dbd92c49d6aef630a6d74f8588": { + "7fc5e7ea894245579a1abdeed4492954": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -2551,7 +2552,7 @@ "width": null } }, - "8bf87a1e0740469cbea6f0c76dd37735": { + "88b18cc3b9cf4eaa8edc5b16aaf28821": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -2603,7 +2604,7 @@ "width": null } }, - "cc4bbcf9f0c44cd69117dc6251954049": { + "d5935c9201ed471ab2385df6e607380c": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", @@ -2618,7 +2619,7 @@ "description_width": "" } }, - "b988186cafdf4da2a30bcff8a6a00f70": { + "4aa0b28b2aa44951aad3bfe37fa20051": { 
"model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -2670,7 +2671,7 @@ "width": null } }, - "f998e026eba0488288515e8b69eacc96": { + "43490d899aed4338823374634d05c9f1": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", @@ -2686,7 +2687,7 @@ "description_width": "" } }, - "0df60724539b4708a9323dbb479fa044": { + "512a81b06bd54811bd8b9f80958fa7a4": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", @@ -2738,1033 +2739,7 @@ "width": null } }, - "dad976092fff4859aaf73eb9d85aa01b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "4b8ebee65b2e4789bce617bedb691cb7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_a2a4e23c74fb4892be3f9edd122efd35", - "IPY_MODEL_1abae52fd84841759c034f9fb78c65a3", - "IPY_MODEL_36359ad021e64e6eb733ae740d987e68" - ], - "layout": "IPY_MODEL_80d36d8a650f42d8a990e04014e937a1" - } - }, - "a2a4e23c74fb4892be3f9edd122efd35": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", 
- "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_751c1855804a48ff9f0d6496a5e29506", - "placeholder": "​", - "style": "IPY_MODEL_4749064df71f48f4b947ba208acbac26", - "value": "Optimization Progress: 100%" - } - }, - "1abae52fd84841759c034f9fb78c65a3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_14db8797959a4253a36dd334f4826a00", - "max": 20, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_94e3a14430f64fa9ab318f0e6c37a883", - "value": 20 - } - }, - "36359ad021e64e6eb733ae740d987e68": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9b974c27ffe7444ab7f9683167a6988c", - "placeholder": "​", - "style": "IPY_MODEL_e13925b2a40c4dcf993c62f64d332272", - "value": " 120/120 [08:06<00:00,  4.12s/pipeline]" - } - }, - "80d36d8a650f42d8a990e04014e937a1": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - 
"state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": "hidden", - "width": null - } - }, - "751c1855804a48ff9f0d6496a5e29506": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - 
"left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4749064df71f48f4b947ba208acbac26": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "14db8797959a4253a36dd334f4826a00": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - 
"top": null, - "visibility": null, - "width": null - } - }, - "94e3a14430f64fa9ab318f0e6c37a883": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "9b974c27ffe7444ab7f9683167a6988c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e13925b2a40c4dcf993c62f64d332272": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": 
"@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "d143776d4b464a648b17ae8e280ecc39": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_16a6911e48b142cf83401adb3788792f", - "IPY_MODEL_68f7ddcbed594c8c8a511c20be8d4acd", - "IPY_MODEL_4c3ff39abc4b4f0baeee0538ce75d76f" - ], - "layout": "IPY_MODEL_1f8d747cea354729af6b532fb2a23a97" - } - }, - "16a6911e48b142cf83401adb3788792f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f634d0b3e4c34f058ab52077ccb97407", - "placeholder": "​", - "style": "IPY_MODEL_5df15aaaa49b425e827b57a427479f89", - "value": "Optimization Progress: 100%" - } - }, - "68f7ddcbed594c8c8a511c20be8d4acd": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_11ac4cb2fd7641e98779b027d47f2696", - "max": 20, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_d7a0b39588cb421492dde89492430a84", - "value": 20 - } - }, - "4c3ff39abc4b4f0baeee0538ce75d76f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fe991ac34e224d55b9c27ca1bad2808b", - "placeholder": "​", - "style": "IPY_MODEL_ba7425e6aedf46f5adce1a7e8e2a7c4d", - "value": " 120/120 [07:32<00:00,  4.69s/pipeline]" - } - }, - "1f8d747cea354729af6b532fb2a23a97": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - 
"max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": "hidden", - "width": null - } - }, - "f634d0b3e4c34f058ab52077ccb97407": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5df15aaaa49b425e827b57a427479f89": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } 
- }, - "11ac4cb2fd7641e98779b027d47f2696": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d7a0b39588cb421492dde89492430a84": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "fe991ac34e224d55b9c27ca1bad2808b": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": 
"LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ba7425e6aedf46f5adce1a7e8e2a7c4d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "92ce081546c945089cdd8b00604f9190": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_cfb57c0da6cf422f8e56a68fd2fa8620", - 
"IPY_MODEL_fa57f776a9044f928368ff7299e01bd9", - "IPY_MODEL_2f1b25096e404d069ef24ee45f236a64" - ], - "layout": "IPY_MODEL_08bb4312d1a948e986a03788d286dda1" - } - }, - "cfb57c0da6cf422f8e56a68fd2fa8620": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_17cc427fd9e9467e999b9164774b6eb5", - "placeholder": "​", - "style": "IPY_MODEL_db4793323d154fdba9d4d88d5789d8dd", - "value": "Optimization Progress: 100%" - } - }, - "fa57f776a9044f928368ff7299e01bd9": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_cd4d7a2f6adc4f69b07b1b1230153d44", - "max": 20, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_18ff42e892cf489fb6fa9003fbdecc6c", - "value": 20 - } - }, - "2f1b25096e404d069ef24ee45f236a64": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - 
"description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fcf7b83de17d491b8729ce64d8f3cede", - "placeholder": "​", - "style": "IPY_MODEL_695f315cc88c477084e7737b132ed93a", - "value": " 120/120 [03:08<00:00,  1.76s/pipeline]" - } - }, - "08bb4312d1a948e986a03788d286dda1": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": "hidden", - "width": null - } - }, - "17cc427fd9e9467e999b9164774b6eb5": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - 
"bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "db4793323d154fdba9d4d88d5789d8dd": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "cd4d7a2f6adc4f69b07b1b1230153d44": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - 
"grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "18ff42e892cf489fb6fa9003fbdecc6c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "fcf7b83de17d491b8729ce64d8f3cede": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, 
- "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "695f315cc88c477084e7737b132ed93a": { + "201d44e85f374a06929ae8db5851d61b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", @@ -3794,7 +2769,102 @@ }, { "cell_type": "markdown", - "source": [], + "source": [ + ">[AutoML with TPOT using Adversary Emulation (C2) as datagen - Supervised and Deep Learning with Genetic Programming](#scrollTo=YJeseKzwjOVm)\n", + "\n", + ">[Installations](#scrollTo=Fv4KCLz9j4Zc)\n", + "\n", + ">[Tpot can use RAPIDS for better GPU support](#scrollTo=67jTk-_N42-A)\n", + "\n", + ">[Initialize GitHub authentication (token)](#scrollTo=OQTe2IE95P7c)\n", + "\n", + ">[Data download: captured Sysmon logs from the AE lab](#scrollTo=oYVNy4rNojZc)\n", + "\n", + ">>[ML development Objectives](#scrollTo=oYVNy4rNojZc)\n", + "\n", + ">[Download pre-trained BPE Tokenizer from the project](#scrollTo=Uyn6jyZvMsQI)\n", + "\n", + ">[ETL and labeling](#scrollTo=qM4Ym_5Zon4w)\n", + "\n", + ">[Sorting the data to get a timeline](#scrollTo=tKBQ5qsaBH02)\n", + "\n", + ">[Denoised vector generation through message filtering](#scrollTo=cYjpfMq1qKb1)\n", + "\n", + ">[Data exploration: create extra tables for labelling](#scrollTo=E_THJfYFmgLc)\n", + "\n", + ">[Qualitative behaviour analysis: temporary folder on Windows 10](#scrollTo=IuK7hfb_mq8e)\n", + "\n", + ">[Quantitative behaviour analysis: How many \"bad\" conditions did this yield?](#scrollTo=mVu5858Xm8qU)\n", + "\n", + ">[Store the data as CSV and JSON](#scrollTo=BGPg1RDmxMVE)\n", + "\n", + ">[Labeling based on Python](#scrollTo=GgFNrU80xQsF)\n", + "\n", + ">[Check of the Adversary Emulation data: a simulation dataset for InfoSec](#scrollTo=MES2a9BdnkU4)\n", + "\n", + ">[Vectorization with Linformer](#scrollTo=9-1dkT2mQHvJ)\n", + "\n", 
+ ">[Example messages from the two classes (for a single-label classifier)](#scrollTo=91khWgQT3DVa)\n", + "\n", + ">[Example messages from the two classes (for a single-label classifier) (pandas framework code)](#scrollTo=ZY-8c7Q_3mEg)\n", + "\n", + ">[Vectorize text column in the DataFrame](#scrollTo=h4cuBLWrSNVs)\n", + "\n", + ">[The following code performs the high-dimensional vector projection with the Linformer model](#scrollTo=kSVB7YH-4czH)\n", + "\n", + ">[Saving the vectorized dataset as Parquet](#scrollTo=ScKF9UeB66LK)\n", + "\n", + ">[Import the data](#scrollTo=7LTDH-8RecLy)\n", + "\n", + ">[Kaggle download: log2ml dataset with Adversary Emulation data](#scrollTo=zmJzQP08sFfd)\n", + "\n", + ">[Import the parquet file into polars with pandas](#scrollTo=g7Tzv9jXsLWs)\n", + "\n", + ">[Load data into pandas](#scrollTo=_LydloD7sczq)\n", + "\n", + ">[AutoML with TPOT (Supervised Learning)](#scrollTo=ZqGIRBU6aE3U)\n", + "\n", + ">[Sparse vector data](#scrollTo=0Xxcb4ITMAH_)\n", + "\n", + ">[TPOT for Supervised Learning with Algo stats](#scrollTo=Gf4Fu4cDs4pr)\n", + "\n", + ">[TPOT Algo Stats: Supervised Learning](#scrollTo=KYJ1AkAStACV)\n", + "\n", + ">[Model export: TPOT's best pipeline, Supervised](#scrollTo=vM6TVwHYtKNa)\n", + "\n", + ">[Evaluation of the model](#scrollTo=lqXIlzZ0u7oN)\n", + "\n", + ">[Handling Imbalanced Data](#scrollTo=RG5bRw2lt0GD)\n", + "\n", + ">[Testing the predictions](#scrollTo=zCJ3jHyZQIS_)\n", + "\n", + ">[Save the Supervised Learning Model (joblib)](#scrollTo=YVPPGLSA6uZ-)\n", + "\n", + ">[Confusion matrix](#scrollTo=l1u70hEaAYgD)\n", + "\n", + ">[CV scores](#scrollTo=0jtLBuvnAeND)\n", + "\n", + ">[Grid search optimization](#scrollTo=_XUIxpRdKzf9)\n", + "\n", + ">[Tpot (Supervised)](#scrollTo=ZWRH59wKLaAX)\n", + "\n", + ">[Building a NN classifier with Tpot for an imbalanced dataset](#scrollTo=PEWJBJqf15m-)\n", + "\n", + ">[Using cuML for GPU accelerated model gen](#scrollTo=QJQXi06FdUDN)\n", + "\n" + ], + "metadata": { + 
"colab_type": "toc", + "id": "rObtGEu1R21B" + } + }, + { + "cell_type": "markdown", + "source": [ + "For long-running tasks it's possible to keep Colab from disconnecting (July 2024):\n", + "\n", + "https://gist.github.com/pouyaardehkhani/29a59270801a209d4960e2aefe648bbc" + ], "metadata": { "id": "wUrhwXsDkETs" } @@ -3802,7 +3872,7 @@ { "cell_type": "markdown", "source": [ - "# Installations\n", + "# Installation of dependencies (Google Colab)\n", "\n", "The installation is mostly automated.\n", "\n", @@ -3811,7 +3881,9 @@ "The file has one line:\n", "\n", "\n", - "`echo \"GITHUB_PERSONAL_ACCESS_TOKEN=\"ghp_...\" > thesis_ro`" + "`echo \"GITHUB_PERSONAL_ACCESS_TOKEN=\"ghp_...\" > thesis_ro`\n", + "\n", + "Colab has a set of tools installed by default. If this Notebook is used outside of Colab, additional dependencies may be required." ], "metadata": { "id": "Fv4KCLz9j4Zc" @@ -3829,6 +3901,13 @@ "import os\n", "import subprocess\n", "\n", + "\"\"\"\n", + "ATTENTION: this runs a shell script without further confirmations.\n", + "Modifications to the systems may be required.\n", + "\n", + "Please use this in a lab environment.\n", + "\"\"\"\n", + "\n", "IN_COLAB = 'google.colab' in sys.modules\n", "\n", "if not IN_COLAB:\n", @@ -3838,18 +3917,23 @@ " subprocess.run('''\n", " source <(curl -s https://raw.githubusercontent.com/norandom/log2ml/main/dependencies/install.sh)\n", " ''',\n", - " shell=True, check=True, executable='/bin/bash')\n", - "\n" + " shell=True, check=True, executable='/bin/bash')" ] }, { "cell_type": "markdown", "source": [ - "## Tpot can use RAPIDS for better GPU support\n", + "# Tpot can use RAPIDS for improved GPU acceleration\n", "\n", "https://medium.com/rapids-ai/faster-automl-with-tpot-and-rapids-758455cd89e5\n", "\n", - "https://docs.rapids.ai/api/cuml/stable/" + "https://docs.rapids.ai/api/cuml/stable/\n", + "\n", + "https://docs.rapids.ai/deployment/stable/platforms/colab/\n", + "\n", + "* cuml should be version 24.
This is only relevant for some accelerated GPU-only models, which Tpot can generate with the help of DEAP (GP)\n", + "\n", + "https://deap.readthedocs.io/en/master/" ], "metadata": { "id": "67jTk-_N42-A" @@ -3867,9 +2951,9 @@ "height": 35 }, "id": "0Gu0pYEO4pxy", - "outputId": "f48fb56b-f77e-4b7f-cd77-82bc70ad3e3d" + "outputId": "9bc36d8d-db34-4f45-c3ef-b5dd913db591" }, - "execution_count": 3, + "execution_count": 4, "outputs": [ { "output_type": "execute_result", @@ -3882,7 +2966,7 @@ } }, "metadata": {}, - "execution_count": 3 + "execution_count": 4 } ] }, @@ -3905,6 +2989,14 @@ "from dotenv import load_dotenv\n", "import os\n", "\n", + "\"\"\"\n", + "Please ensure that you place the token into the file \"thesis_ro\".\n", + "Use the same folder, or modify the code.\n", + "Do not echo the token, and save the notebook.\n", + "Otherwise the token leaks.\n", + "\"\"\"\n", + "\n", + "\n", "load_dotenv(\"thesis_ro\", verbose=True) # take environment variables from the file\n", "token = os.getenv('GITHUB_PERSONAL_ACCESS_TOKEN')\n", "if len(token) > 0:\n", @@ -3917,9 +3009,9 @@ "base_uri": "https://localhost:8080/" }, "id": "vyKWa35bkFcG", - "outputId": "afd6da24-fb42-48e7-bece-8697870b2cb8" + "outputId": "39c78aeb-bcd9-4bfe-aa8e-d56b4e819eec" }, - "execution_count": 21, + "execution_count": 20, "outputs": [ { "output_type": "stream", @@ -3937,7 +3029,7 @@ "\n", "These samples contain Sysmon log activity of Dropper Malware (C2 Dropper, MS Excel VBA, Covenant).\n", "\n", - "No full AE campaigns with post-exploitation, just the Dropper itself.\n", + "No full Adversary Emulation campaigns with post-exploitation, just the Dropper itself.\n", "\n", "The set contains security agent telemetry (aka sysmon lohs) of 1000 documents, some malicious and some not. These are imbalanced datasets.\n", "\n", @@ -3945,10 +3037,10 @@ "\n", "## ML development Objectives\n", "\n", - "1. Classify: which ones are malicious?\n", - "2. Explore: how does the VBA Excel malware behave? \n", - "3. 
Evaluate: Can ML help to find out?\n", - "4. Engineer: can Tpot autogen the ML models?" + "1. **Explore**: how does the VBA Excel malware behave? \n", + "2. **Classify**: which ones are malicious?\n", + "3. **Evaluate**: Can ML help to find out?\n", + "4. **Engineer**: can Tpot autogen the ML models?" ], "metadata": { "id": "oYVNy4rNojZc" @@ -4009,14 +3101,32 @@ " if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:\n", " print(\"ERROR, something went wrong\")\n", " else:\n", - " print(f\"File downloaded successfully and saved as {save_path}\")\n", - "\n", + " print(f\"File downloaded successfully and saved as {save_path}\")" + ], + "metadata": { + "id": "5EEoa3gUmFpn" + }, + "execution_count": 18, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ "# Your GitHub token\n", "github_token = token\n", "\n", "# Repository name\n", - "repository_name = \"norandom/log2ml\"\n", - "\n", + "repository_name = \"norandom/log2ml\"" + ], + "metadata": { + "id": "wwb-mj2pUAz_" + }, + "execution_count": 21, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ "# File name to search for\n", "file_name = \"lab_logs_blindtest_activity_sysmon_1000samples_july_28_2024.csv\"\n", "\n", @@ -4036,23 +3146,23 @@ "base_uri": "https://localhost:8080/", "height": 85, "referenced_widgets": [ - "2cb39463ffff4f9f8c6c6ece6f0e2769", - "c51ec13acbd346de98bf66fd543c8ddd", - "b325e19f0a7c4baebd5660a78e45131a", - "3878d666a20c45d995dfc79b02c2c165", - "e2aee7171ace4dc0bb8cd09220db5d12", - "48622890a0254e49b2d26af09626560f", - "bcc8ab2a76e54b20a36e8b953a78f0e4", - "4dcbd8944dcc4ea1b6992ad7740da36e", - "961a05a5d5b446699f82905129b7cd19", - "8454cae7dcf54932a356ad972733775f", - "5a0d14012c40493081d1cca56b5ffde2" + "2b1a6384b01945e8b5422f8033b35f3e", + "ac9c8d71d04244faa9bc62060455bf18", + "1fc73e630cf24f478dfbb3b8f2761383", + "6e5336aa80094ee3826ba89335fbfd77", + "e8d44a1dded246ae94cd3413a216be4d", + "d77493bbfb9d4197bc83de151f5398c7", + 
"627dcda5ea37455da054403897787bf2", + "a6d24f5b014f4202805c31486fc656cb", + "8ab240c04e6e4ec5bc14918d47deffdf", + "823df5cd8416469a9d826b118f741d6d", + "2ac41eef5c6248c6ae773df951bd82d2" ] }, - "id": "5EEoa3gUmFpn", - "outputId": "c2f808b1-a2f2-455d-e3cd-939658d5b658" + "id": "yn8bBu9tT5_W", + "outputId": "00ecb757-1f83-49a5-a8fd-02fe855961ea" }, - "execution_count": 4, + "execution_count": 8, "outputs": [ { "output_type": "stream", @@ -4070,7 +3180,7 @@ "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, - "model_id": "2cb39463ffff4f9f8c6c6ece6f0e2769" + "model_id": "2b1a6384b01945e8b5422f8033b35f3e" } }, "metadata": {} @@ -4091,7 +3201,9 @@ "\n", "The BPE tokenizer is used for the Sysmon messages, not for the entire JSON or CSV files.\n", "\n", - "https://towardsdatascience.com/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0" + "https://towardsdatascience.com/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0\n", + "\n", + "The tokenizer has been trained on the messages." 
], "metadata": { "id": "Uyn6jyZvMsQI" @@ -4119,23 +3231,23 @@ "base_uri": "https://localhost:8080/", "height": 85, "referenced_widgets": [ - "6043ef5c70a3493d8f45d8231726d2a6", - "317e803897b64bc58bb4b7d9d0ef2504", - "97270bdece1442719201203cfe72f262", - "ef2ae2c4892247919876aa97f42a3561", - "e5482a1a80244f039aeadfd8fe87534f", - "1c213845078846779f14d6143da25162", - "805d5e39429847c893d38d02524b6a59", - "09e0bcda1a3645268fc0ec1b7540538f", - "07ffb70db96a431cba11b328e65b312d", - "ce339a8ee46c435493635d30526a8f35", - "2b4516e19f694d9c98a038ac222735be" + "b188bad6827047b4aa8d57facea61b3c", + "7b85fe2ab1314a27bf6f2715bcf2c253", + "7c6ae3b2016a4974ad5d80a36fa59c5f", + "72ad35f43fcb414f85a72baea10ceaee", + "f6087d8ae5584ed8805f20655307c694", + "c3f1706f0a6643abbe02debdccc80d16", + "b2023b49e067416a8712a878187ae26a", + "d836a994fe684de792bffea9d47097b0", + "e72b3600beb84517a48e51558e681ecb", + "4ea0250740634945be80f5352bee2b28", + "c1290b54ba5d405fad098180f83b7a5a" ] }, "id": "PA-8JXLyn4Uv", - "outputId": "08639279-a742-40ca-b95e-f9cfe9d7b58e" + "outputId": "67390f4f-b3a0-4f44-f80c-c9afc008910f" }, - "execution_count": 5, + "execution_count": 22, "outputs": [ { "output_type": "stream", @@ -4153,7 +3265,7 @@ "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, - "model_id": "6043ef5c70a3493d8f45d8231726d2a6" + "model_id": "b188bad6827047b4aa8d57facea61b3c" } }, "metadata": {} @@ -4172,11 +3284,11 @@ "source": [ "# ETL and labeling\n", "\n", - "This uses Polars as an ETL (Extract Transform Load) tool.\n", + "This uses Polars as an ETL (**Extract Transform Load**) tool. This concerns in-memory processing of data tables, based on DataFrames. 
These are very popular in DataScience, because many tools can consume DataFrames.\n", "\n", "https://pola.rs/\n", "\n", - "Pandas is used later, because it's more established and compatible to Tpot, which is used for Genetic Programming AutoML; in this sense for hyperparameter optimization and automated feature selection.\n", + "Pandas is used later, because it's more established and compatible to Tpot, which is used for Genetic Programming AutoML; for hyperparameter optimization and automated feature selection.\n", "\n", "https://pandas.pydata.org/" ], @@ -4202,7 +3314,7 @@ "base_uri": "https://localhost:8080/" }, "id": "SCKEWW-aoNDQ", - "outputId": "ce3d8ea3-0f89-487f-835a-15365a46dff8" + "outputId": "ecb8cbda-d26e-44a0-f5ee-5cd1ea57a969" }, "execution_count": 6, "outputs": [ @@ -4258,7 +3370,7 @@ { "cell_type": "markdown", "source": [ - "## Sorting the data\n", + "# Sorting the data to get a timeline\n", "\n", "The data was received from the Elasticsearch DB.\n", "\n", @@ -4290,7 +3402,7 @@ { "cell_type": "markdown", "source": [ - "## Denoised vector generation\n", + "# Denoised vector generation through message filtering\n", "\n", "Here system specific strings get removed. This improves the transferability.\n", "Sysmon creates very detailed logs.\n", @@ -4357,7 +3469,7 @@ "base_uri": "https://localhost:8080/" }, "id": "_CJUqGQUqFew", - "outputId": "4aba53a3-25a2-4a49-8a4b-89f4ec5b1783" + "outputId": "0fe25e96-12d1-419f-bb6a-626312d2441b" }, "execution_count": 8, "outputs": [ @@ -4421,9 +3533,9 @@ { "cell_type": "markdown", "source": [ - "## Data exploration\n", + "# Data exploration: create extra tables for labelling\n", "\n", - "The following data exploration is part of the analytic process." + "The following data exploration is part of the analytic process, which aims to produce labelled data for the Supervised Learning approach." 
], "metadata": { "id": "E_THJfYFmgLc" @@ -4471,9 +3583,9 @@ "base_uri": "https://localhost:8080/" }, "id": "WBy3Rqj_orz_", - "outputId": "303db2d0-8fac-48be-b275-5d58409c70c2" + "outputId": "60dbbef7-0753-45af-b6df-379880b63347" }, - "execution_count": 9, + "execution_count": 11, "outputs": [ { "output_type": "stream", @@ -4517,34 +3629,34 @@ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Ut… │\n", "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘\n", "shape: (41, 2)\n", - "┌───────────────────────────────────┬───────┐\n", - "│ parent_image ┆ count │\n", - "│ --- ┆ --- │\n", - "│ str ┆ u32 │\n", - "╞═══════════════════════════════════╪═══════╡\n", - "│ PLUGScheduler.exe ┆ 2 │\n", - "│ MpCmdRun.exe ┆ 1 │\n", - "│ MicrosoftEdge_X64_127.0.2651.74_… ┆ 1 │\n", - "│ csc.exe ┆ 1 │\n", - "│ … ┆ … │\n", - "│ upfc.exe ┆ 1 │\n", - "│ overseer.exe ┆ 4 │\n", - "│ pip.exe ┆ 9 │\n", - "│ smss.exe ┆ 3 │\n", - "└───────────────────────────────────┴───────┘\n", + "┌─────────────────────┬───────┐\n", + "│ parent_image ┆ count │\n", + "│ --- ┆ --- │\n", + "│ str ┆ u32 │\n", + "╞═════════════════════╪═══════╡\n", + "│ python.exe ┆ 1044 │\n", + "│ runonce.exe ┆ 1 │\n", + "│ Integrator.exe ┆ 10 │\n", + "│ CompatTelRunner.exe ┆ 2 │\n", + "│ … ┆ … │\n", + "│ AvastUI.exe ┆ 6 │\n", + "│ AvLaunch.exe ┆ 1 │\n", + "│ upfc.exe ┆ 1 │\n", + "│ winlogon.exe ┆ 8 │\n", + "└─────────────────────┴───────┘\n", "shape: (91, 2)\n", "┌───────────────────────────────────┬───────┐\n", "│ target_filename ┆ count │\n", "│ --- ┆ --- │\n", "│ str ┆ u32 │\n", "╞═══════════════════════════════════╪═══════╡\n", - "│ C:\\Program Files (x86)\\Microsoft… ┆ 1 │\n", + "│ C:\\Users\\student\\AppData\\Local\\M… ┆ 1 │\n", "│ C:\\Users\\student\\AppData\\Local\\T… ┆ 1 │\n", - "│ C:\\Program Files (x86)\\Microsoft… ┆ 1 │\n", "│ C:\\Users\\student\\AppData\\Local\\T… ┆ 1 │\n", + "│ C:\\Program Files (x86)\\Microsoft… ┆ 1 │\n", "│ … ┆ … │\n", "│ C:\\Users\\student\\AppData\\Local\\T… ┆ 1 │\n", - "│ 
C:\\Users\\student\\AppData\\Local\\T… ┆ 1 │\n", + "│ C:\\Program Files (x86)\\Microsoft… ┆ 1 │\n", "│ C:\\Users\\student\\AppData\\Local\\T… ┆ 1 │\n", "│ C:\\Users\\student\\AppData\\Local\\T… ┆ 1 │\n", "└───────────────────────────────────┴───────┘\n" @@ -4555,7 +3667,7 @@ { "cell_type": "markdown", "source": [ - "## Behavioural analysis: temporary folder on Windows 10\n", + "# Qualitative behaviour analysis: temporary folder on Windows 10\n", "\n", "It seems that suspicious activity involves temporary folders. A list of the temp folders is used to define a condition." ], @@ -4618,9 +3730,9 @@ "base_uri": "https://localhost:8080/" }, "id": "b0wLxKfzsg1e", - "outputId": "a08e43b0-a259-4112-b9f6-b62d126a9425" + "outputId": "83ec6ca7-1ef3-4486-a2b9-4cc1845b18e9" }, - "execution_count": 10, + "execution_count": 12, "outputs": [ { "output_type": "stream", @@ -4648,8 +3760,8 @@ "│ --- ┆ --- │\n", "│ str ┆ u32 │\n", "╞═════════════╪═══════╡\n", - "│ No ┆ 13289 │\n", "│ Yes ┆ 166 │\n", + "│ No ┆ 13289 │\n", "└─────────────┴───────┘\n" ] } @@ -4658,7 +3770,7 @@ { "cell_type": "markdown", "source": [ - "## How many \"bad\" conditions did this yield?" + "# Quantitative behaviour analysis: How many \"bad\" conditions did this yield?" ], "metadata": { "id": "mVu5858Xm8qU" @@ -4675,9 +3787,9 @@ "base_uri": "https://localhost:8080/" }, "id": "uj7NV2wGPHi2", - "outputId": "f4baf9a2-4dbd-4fa1-fd61-8529159cd4fa" + "outputId": "59ffa6bd-ce49-43ab-92e0-67e72adc21c7" }, - "execution_count": 11, + "execution_count": 13, "outputs": [ { "output_type": "stream", @@ -4691,11 +3803,11 @@ { "cell_type": "markdown", "source": [ - "## Store the data as CSV and JSON\n", + "# Store the data as CSV and JSON\n", "\n", "CSV and JSON are popular data format standards.\n", "\n", - "The multiline CSV can be opened with vim, and searched. For the JSON this is harder." + "The multiline CSV can be opened with `vim`, and searched. For the JSON this is harder, but there are command-line utils like `jq`. 
The analysis focuses on the CSV, but the JSON file is shared as well (https://github.com/norandom/log2ml/releases/tag/lab)" ], "metadata": { "id": "BGPg1RDmxMVE" @@ -4709,7 +3821,7 @@ "metadata": { "id": "ngGJUIsXu5fL" }, - "execution_count": 12, + "execution_count": 14, "outputs": [] }, { @@ -4720,7 +3832,7 @@ "metadata": { "id": "lRQS5xLTvpZv" }, - "execution_count": 13, + "execution_count": 15, "outputs": [] }, { @@ -4762,9 +3874,9 @@ "base_uri": "https://localhost:8080/" }, "id": "X5ptzPnfxP9M", - "outputId": "6bc45ef2-75a9-4055-e30d-83de249c8065" + "outputId": "0ac19338-925b-4d3e-c43f-baa98a138d82" }, - "execution_count": 14, + "execution_count": 16, "outputs": [ { "output_type": "stream", @@ -4800,8 +3912,8 @@ "│ --- ┆ --- │\n", "│ str ┆ u32 │\n", "╞═══════╪═══════╡\n", - "│ good ┆ 13341 │\n", "│ bad ┆ 114 │\n", + "│ good ┆ 13341 │\n", "└───────┴───────┘\n" ] } @@ -4810,7 +3922,7 @@ { "cell_type": "markdown", "source": [ - "## Simulation\n", + "# Check of the Adversary Emulation data: a simulation dataset for InfoSec\n", "\n", "The dataset contains security simulation (Adversary Emulation) data. A software robot opened 1000 files. file.exe was dropped to a temp folder." ], @@ -4843,7 +3955,7 @@ "id": "dRH3AReO_i5D", "outputId": "264a8f1e-5f67-4be4-88a5-a8962df60071" }, - "execution_count": 15, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -4878,7 +3990,9 @@ "\n", "https://ai.meta.com/blog/how-facebook-uses-super-efficient-ai-models-to-detect-hate-speech/\n", "\n", - "Facebook uses Linformer to detect hate speech, which is often" + "https://github.com/tatp22/linformer-pytorch\n", + "\n", + "Facebook uses Linformer to detect hate speech. In Social Networks hate speech isn't constructed like prosa literature, but contains slang and symbol characters as a means to obscure the meaning to outsiders. 
This indicates that the Linformer model could potentially be useful for semi-natural language, which is also common for log- and telemetry data (in InfoSec)" ], "metadata": { "id": "9-1dkT2mQHvJ" @@ -4893,7 +4007,7 @@ "metadata": { "id": "_Vgi0rjn_RIn" }, - "execution_count": 16, + "execution_count": 11, "outputs": [] }, { @@ -4933,13 +4047,16 @@ "metadata": { "id": "SsW6vlQNQhAy" }, - "execution_count": 17, + "execution_count": 15, "outputs": [] }, { "cell_type": "code", "source": [ - "## The model can get fine-tuned." + "## The model can get fine-tuned. This isn't necessary, but it's possible for optimization.\n", + "## Many ML problems become optimization problems, once the models start to perform.\n", + "## AutoML (with GP) can accelerate the optimization, especially when it comes to balancing\n", + "## performance and precision." ], "metadata": { "id": "Q-Y1HQMqzN-5" @@ -4950,10 +4067,20 @@ { "cell_type": "markdown", "source": [ - "## Vectorize text column in the DataFrame" + "# Example messages from the two classes (for a single-label classifier)\n", + "\n", + "We have two classes:\n", + "\n", + "\n", + "1. good\n", + "2. bad\n", + "\n", + "\n", + "\n", + "The bad-class is a minority class. This is typical for domains such as InfoSec (or medicine etc.) where the identification of an abnormality needs to be performed. For ML, handling minority class problems poses an extra challenge."
], "metadata": { - "id": "h4cuBLWrSNVs" + "id": "91khWgQT3DVa" } }, { @@ -4987,8 +4114,126 @@ " # Get a random index\n", " random_index = random.randint(0, good_messages.height - 1)\n", "\n", - " # Select the message at that random index\n", - " random_good_message = good_messages.row(random_index)[0]\n", + " # Select the message at that random index\n", + " random_good_message = good_messages.row(random_index)[0]\n", + " print(random_good_message)\n", + "else:\n", + " print(\"No good messages labeled found.\")" + ], + "metadata": { + "id": "4SguqPPKSdpJ", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "16c10c29-88d4-4639-f645-1480bab8a94c" + }, + "execution_count": 19, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['@timestamp', 'host.hostname', 'host.ip', 'log.level', 'winlog.event_id', 'winlog.task', 'message', 'filtered_message', 'image', 'target_filename', 'parent_image', 'text', 'temp_folder', 'filename', 'label']\n", + "shape: (13_455, 2)\n", + "┌───────┬──────────────────────────────┐\n", + "│ label ┆ filtered_message │\n", + "│ --- ┆ --- │\n", + "│ str ┆ str │\n", + "╞═══════╪══════════════════════════════╡\n", + "│ good ┆ Network connection detected: │\n", + "│ ┆ Rul… │\n", + "│ good ┆ Network connection detected: │\n", + "│ ┆ Rul… │\n", + "│ good ┆ Process accessed: │\n", + "│ ┆ RuleName: - │\n", + "│ ┆ Ut… │\n", + "│ good ┆ Process accessed: │\n", + "│ ┆ RuleName: - │\n", + "│ ┆ Ut… │\n", + "│ … ┆ … │\n", + "│ good ┆ Process accessed: │\n", + "│ ┆ RuleName: - │\n", + "│ ┆ Ut… │\n", + "│ good ┆ Process accessed: │\n", + "│ ┆ RuleName: - │\n", + "│ ┆ Ut… │\n", + "│ good ┆ Process accessed: │\n", + "│ ┆ RuleName: - │\n", + "│ ┆ Ut… │\n", + "│ good ┆ Process Create: │\n", + "│ ┆ RuleName: - │\n", + "│ ┆ UtcT… │\n", + "└───────┴──────────────────────────────┘\n", + "File created:\n", + "RuleName: EXE\n", + "UtcTime: 2024-07-28 20:40:26.601\n", + "ProcessGuid: 
{18e8265a-acb5-66a6-fb0d-000000004400}\n", + "ProcessId: 2428\n", + "Image: C:\\Program Files\\Microsoft Office\\Root\\Office16\\EXCEL.EXE\n", + "TargetFilename: C:\\Users\\student\\AppData\\Local\\Temp\\file.exe\n", + "CreationUtcTime: 2024-07-23 14:24:50.520\n", + "\n", + "\n", + "\n", + "File created:\n", + "RuleName: DLL\n", + "UtcTime: 2024-07-28 15:23:00.514\n", + "ProcessGuid: {18e8265a-6254-66a6-e102-000000004400}\n", + "ProcessId: 10948\n", + "Image: C:\\Program Files\\Avast Software\\Avast\\Setup\\Instup.exe\n", + "TargetFilename: C:\\Program Files\\Avast Software\\Avast\\setup\\uat64.dll\n", + "CreationUtcTime: 2023-03-09 19:28:07.792\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Example messages from the two classes (for a single-label classifier) (pandas framework code)\n", + "\n", + "This is the same but with Pandas.\n", + "\n", + "The variables are the same:\n", + "\n", + "\n", + "```\n", + "random_bad_message = \"\"\n", + "random_good_message = \"\"\n", + "```\n", + "\n", + "The following code is needed if the data get downloaded from the release and not processed step by step." 
+ ], + "metadata": { + "id": "ZY-8c7Q_3mEg" + } + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import random\n", + "\n", + "# Assuming df_f is your pandas DataFrame\n", + "print(df_f.columns)\n", + "print(df_f[[\"label\", \"filtered_message\"]])\n", + "\n", + "bad_messages = df_f[df_f['label'] == 'bad']['filtered_message']\n", + "good_messages = df_f[df_f['label'] == 'good']['filtered_message']\n", + "\n", + "random_bad_message = \"\"\n", + "random_good_message = \"\"\n", + "\n", + "if not bad_messages.empty:\n", + " random_bad_message = bad_messages.sample().iloc[0]\n", + " print(random_bad_message)\n", + "else:\n", + " print(\"No bad messages labeled found.\")\n", + "\n", + "print(\"\\n\\n\")\n", + "\n", + "if not good_messages.empty:\n", + " random_good_message = good_messages.sample().iloc[0]\n", " print(random_good_message)\n", "else:\n", " print(\"No good messages labeled found.\")" @@ -4997,70 +4242,67 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "id": "4SguqPPKSdpJ", - "outputId": "1dda19c4-66e1-48e2-e3b5-e7ec63e68102" + "id": "cVZe3nmRBwc2", + "outputId": "dd316cbf-c3c3-44fd-9614-497382fc95b2" }, - "execution_count": 18, + "execution_count": 25, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ - "['@timestamp', 'host.hostname', 'host.ip', 'log.level', 'winlog.event_id', 'winlog.task', 'message', 'filtered_message', 'image', 'target_filename', 'parent_image', 'text', 'temp_folder', 'filename', 'label']\n", - "shape: (13_455, 2)\n", - "┌───────┬──────────────────────────────┐\n", - "│ label ┆ filtered_message │\n", - "│ --- ┆ --- │\n", - "│ str ┆ str │\n", - "╞═══════╪══════════════════════════════╡\n", - "│ good ┆ Network connection detected: │\n", - "│ ┆ Rul… │\n", - "│ good ┆ Network connection detected: │\n", - "│ ┆ Rul… │\n", - "│ good ┆ Process accessed: │\n", - "│ ┆ RuleName: - │\n", - "│ ┆ Ut… │\n", - "│ good ┆ Process accessed: │\n", - "│ ┆ RuleName: - │\n", - "│ ┆ Ut… │\n", - "│ … ┆ … 
│\n", - "│ good ┆ Process accessed: │\n", - "│ ┆ RuleName: - │\n", - "│ ┆ Ut… │\n", - "│ good ┆ Process accessed: │\n", - "│ ┆ RuleName: - │\n", - "│ ┆ Ut… │\n", - "│ good ┆ Process accessed: │\n", - "│ ┆ RuleName: - │\n", - "│ ┆ Ut… │\n", - "│ good ┆ Process Create: │\n", - "│ ┆ RuleName: - │\n", - "│ ┆ UtcT… │\n", - "└───────┴──────────────────────────────┘\n", + "Index(['@timestamp', 'host.hostname', 'host.ip', 'log.level',\n", + " 'winlog.event_id', 'winlog.task', 'message', 'filtered_message',\n", + " 'image', 'target_filename', 'parent_image', 'text', 'temp_folder',\n", + " 'filename', 'label', 'message_vector'],\n", + " dtype='object')\n", + " label filtered_message\n", + "0 good Network connection detected:\\nRuleName: -\\nUtc...\n", + "1 good Network connection detected:\\nRuleName: -\\nUtc...\n", + "2 good Process accessed:\\nRuleName: -\\nUtcTime: 2024-...\n", + "3 good Process accessed:\\nRuleName: -\\nUtcTime: 2024-...\n", + "4 good Process accessed:\\nRuleName: -\\nUtcTime: 2024-...\n", + "... ... 
...\n", + "13450 good Process accessed:\\nRuleName: -\\nUtcTime: 2024-...\n", + "13451 good Process accessed:\\nRuleName: -\\nUtcTime: 2024-...\n", + "13452 good Process accessed:\\nRuleName: -\\nUtcTime: 2024-...\n", + "13453 good Process accessed:\\nRuleName: -\\nUtcTime: 2024-...\n", + "13454 good Process Create:\\nRuleName: -\\nUtcTime: 2024-07...\n", + "\n", + "[13455 rows x 2 columns]\n", "File created:\n", "RuleName: EXE\n", - "UtcTime: 2024-07-28 17:05:05.424\n", - "ProcessGuid: {18e8265a-7a3c-66a6-d507-000000004400}\n", - "ProcessId: 3736\n", + "UtcTime: 2024-07-28 15:53:22.557\n", + "ProcessGuid: {18e8265a-696d-66a6-7705-000000004400}\n", + "ProcessId: 10624\n", "Image: C:\\Program Files\\Microsoft Office\\Root\\Office16\\EXCEL.EXE\n", "TargetFilename: C:\\Users\\student\\AppData\\Local\\Temp\\file.exe\n", "CreationUtcTime: 2024-07-23 14:24:50.520\n", "\n", "\n", "\n", - "Registry value set:\n", - "RuleName: InvDB-Path\n", - "EventType: SetValue\n", - "UtcTime: 2024-07-28 15:13:19.757\n", - "ProcessGuid: {18e8265a-5f49-66a6-2601-000000004400}\n", - "ProcessId: 10812\n", - "Image: C:\\Windows\\system32\\CompatTelRunner.exe\n", - "TargetObject: \\REGISTRY\\A\\{90cbbb87-bac4-4fa3-1d8b-b1a042a75259}\\Root\\InventoryApplicationFile\\msedge_pwa_launc|326a60d0d6b1ca83\\LowerCaseLongPath\n", - "Details: c:\\program files (x86)\\microsoft\\edgecore\\126.0.2592.113\\msedge_pwa_launcher.exe\n" + "Dns query:\n", + "RuleName: -\n", + "UtcTime: 2024-07-28 19:08:06.591\n", + "ProcessGuid: {18e8265a-9715-66a6-600b-000000004400}\n", + "ProcessId: 540\n", + "QueryName: messaging\n", + "QueryStatus: 0\n", + "QueryResults: type: 5 prod-campaignaggregator.omexexternallfb.office.net.akadns.net;::ffff:52.109.16.3;\n", + "Image: C:\\Program Files\\Microsoft Office\\root\\Office16\\EXCEL.EXE\n" ] } ] }, + { + "cell_type": "markdown", + "source": [ + "# Vectorize text column in the DataFrame" + ], + "metadata": { + "id": "h4cuBLWrSNVs" + } + }, { "cell_type": "code", "source": [ 
@@ -5074,7 +4316,8 @@ "\n", "# Define the device (assuming you're using PyTorch and want to specify CPU or GPU)\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - "print(device)\n", + "\n", + "print(\"This uses a \" + str(device) + \" device\")\n", "\n", "def vectorize_text(text):\n", " MAX_LENGTH = 700 # Define the maximum length of tokens for the model\n", @@ -5096,35 +4339,74 @@ "\n", " # Assuming outputs is the tensor of interest\n", " vector = outputs.mean(dim=1).detach() # Detach the tensor from the GPU\n", - " return vector.cpu().numpy() # Move tensor back to CPU and convert to numpy\n", - "\n" + " return vector.cpu().numpy() # Move tensor back to CPU and convert to numpy" ], "metadata": { - "id": "EyvdFj83SQKI" + "id": "EyvdFj83SQKI", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "882141d8-1956-4f6b-ba33-b5dbe205e4af" }, - "execution_count": 19, - "outputs": [] + "execution_count": 23, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "This uses a cuda device\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# The following code performs the high-dimensional vector projection with the Linformer model\n", + "\n", + "This can take some minutes on a GPU system.\n", + "On a CPU system the runtime is exponentially higher." 
+ ], + "metadata": { + "id": "kSVB7YH-4czH" + } }, { "cell_type": "code", "source": [ + "%%time\n", "# Assuming df_f is a Polars DataFrame with a column \"filtered_message\"\n", "df_f = df_f.with_columns(\n", " pl.col(\"filtered_message\").map_elements(lambda x: vectorize_text(x).flatten(), return_dtype=pl.Object).alias(\"message_vector\")\n", ")" ], "metadata": { - "id": "doS16Nq4S37g" + "id": "doS16Nq4S37g", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "4e11e7b6-60f1-4caa-e268-b2f64e1a5852" }, - "execution_count": 20, - "outputs": [] + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CPU times: user 2min 4s, sys: 1.05 s, total: 2min 5s\n", + "Wall time: 2min 5s\n" + ] + } + ] }, { "cell_type": "markdown", "source": [ - "## Saving the vectorized dataset as Parquet\n", + "# Saving the vectorized dataset as Parquet\n", "\n", - "This may require up to 32 GB RAM." + "This may require up to 25 GB RAM and 4 GB disk space. The vector data is very sparse (30k dims). That's not strictly necessary, but since we are going to reduce this with a PCA anyway, it's not a big development challenge. The \"Curse of Dimensionality\" gets illustrated this way. Other models, such as LongFormer, produce fewer dimensions and may be more suitable from a development standpoint.\n", + "\n", + "The code uses a mixture of Pandas and Polars, because Polars isn't feature-complete."
], "metadata": { "id": "ScKF9UeB66LK" @@ -5159,7 +4441,7 @@ "metadata": { "id": "clEaIJjHXRhM" }, - "execution_count": 21, + "execution_count": 25, "outputs": [] }, { @@ -5172,9 +4454,9 @@ "base_uri": "https://localhost:8080/" }, "id": "iixa8gHy80B6", - "outputId": "cd9ead43-a885-423c-81a2-a7b8cfa4c997" + "outputId": "d38be1e4-0381-41e2-a553-a7a8a9b203d8" }, - "execution_count": 22, + "execution_count": 26, "outputs": [ { "output_type": "stream", @@ -5188,9 +4470,9 @@ { "cell_type": "markdown", "source": [ - "## Import the data\n", + "# Import the data\n", "\n", - "Setup kaggle.json before (download via the portal).\n", + "The vectorized file can't get uploaded as a GitHub release, because it's too big. As a side-objective the data has been shared with the Kaggle community:\n", "\n", "https://www.kaggle.com/datasets/mariusciepluch/log2ml-blindtest-maldoc-activity-capture" ], @@ -5201,9 +4483,9 @@ { "cell_type": "markdown", "source": [ - "## Kaggle download\n", + "# Kaggle download: log2ml dataset with Adversary Emulation data\n", "\n", - "The following code download the data using the kaggle util." + "The following code downloads the data using the kaggle util. Authentication isn't necessary, because this is a public dataset." ], "metadata": { "id": "zmJzQP08sFfd" @@ -5237,7 +4519,7 @@ "base_uri": "https://localhost:8080/" }, "id": "FWa0Jq8ocMpJ", - "outputId": "121c8484-2f6c-4dbc-c401-d46354c9e06e" + "outputId": "96b34f8f-0541-498f-97eb-057883e52333" }, "execution_count": 2, "outputs": [ @@ -5254,7 +4536,9 @@ { "cell_type": "markdown", "source": [ - "## Import the parquet file into polars with pandas" + "# Load the vector data into polars (with pandas)\n", + "\n", + "Tpot uses Pandas. Therefore the data needs to be in a Pandas DataFrame. It's important to note that we want the vector dimensions to remain the same. It should be 30k. Serializing high-dimensional vector-data to parquet has been a development challenge."
], "metadata": { "id": "g7Tzv9jXsLWs" @@ -5295,7 +4579,7 @@ "id": "981FBJ_rdh-V", "outputId": "e370d4e9-078f-4276-ee6b-3aed7a33344e" }, - "execution_count": 3, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -5325,7 +4609,7 @@ "id": "ZZ1CPkp0AL-z", "outputId": "897f19f7-37ab-48a5-b36a-f55ee2c26d4e" }, - "execution_count": 10, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -5350,7 +4634,7 @@ { "cell_type": "markdown", "source": [ - "## Load data into pandas" + "# Load the vector data into pandas (direct)" ], "metadata": { "id": "_LydloD7sczq" @@ -5401,9 +4685,9 @@ "base_uri": "https://localhost:8080/" }, "id": "ePLz8udYDHsK", - "outputId": "d3dfe2e5-0665-4d4e-d831-24547e194045" + "outputId": "b5a5f575-8104-4887-d3c4-601551634b0d" }, - "execution_count": 4, + "execution_count": 1, "outputs": [ { "output_type": "stream", @@ -5532,9 +4816,9 @@ "base_uri": "https://localhost:8080/" }, "id": "W849gxLgM3vP", - "outputId": "391460a7-ac82-4a20-880a-ee95cabb23a5" + "outputId": "ea78ee8e-1ef6-4e98-87f1-e5ffcfa11566" }, - "execution_count": 5, + "execution_count": 2, "outputs": [ { "output_type": "stream", @@ -5651,24 +4935,6 @@ } ] }, - { - "cell_type": "markdown", - "source": [ - "### Read vectors into DataFrame" - ], - "metadata": { - "id": "oyMG2LKK4LqU" - } - }, - { - "cell_type": "markdown", - "source": [ - "# AutoML with TPOT (Supervised Learning)" - ], - "metadata": { - "id": "ZqGIRBU6aE3U" - } - }, { "cell_type": "code", "source": [ @@ -5695,9 +4961,9 @@ "base_uri": "https://localhost:8080/" }, "id": "9JjYJzacaD-T", - "outputId": "491257ff-d5cf-4608-e65e-499af322debc" + "outputId": "2c9214a3-b9b6-4ce6-fe6b-c88a2ea61550" }, - "execution_count": 6, + "execution_count": 3, "outputs": [ { "output_type": "stream", @@ -5749,6 +5015,15 @@ } ] }, + { + "cell_type": "markdown", + "source": [ + "# AutoML with TPOT (Supervised Learning)" + ], + "metadata": { + "id": "ZqGIRBU6aE3U" + } + }, { "cell_type": "markdown", "source": [ @@ 
-5771,9 +5046,9 @@ "base_uri": "https://localhost:8080/" }, "id": "5NHDYuDAiUis", - "outputId": "22308790-567a-473e-9abd-32dc8fb5034d" + "outputId": "11f23f11-961f-4a2f-a0fb-810f270c9df0" }, - "execution_count": 8, + "execution_count": 30, "outputs": [ { "output_type": "stream", @@ -5787,7 +5062,7 @@ { "cell_type": "markdown", "source": [ - "## TPOT for Supervised Learning with Algo stats" + "# TPOT for Supervised Learning with Algo stats" ], "metadata": { "id": "Gf4Fu4cDs4pr" @@ -5940,186 +5215,45 @@ "metadata": { "colab": { "base_uri": "https://localhost:8080/", - "height": 1000, - "referenced_widgets": [ - "e596dc64bc2145feb8039da7c626250d", - "4f8021ec3c87454e856aa23426dd92f7", - "abaf96bfaed843918b225cf759ab5f5d", - "647e90686f1d49d8b6512e9144287cb7", - "5ccef87811f84ba69ff589e4dd0a0db1", - "3811f9b18eb94a998715a71e1d80c902", - "cd69e99568cc440fb9c70c0e1327c740", - "d6aeb9c1304c478aa713252980655782", - "20a9e1000946406dbe45e5c0b92c88ab", - "50d69c11c64942e7a8551479dbf89ba5", - "e9a9ed09603a4cf38180372257723d1c", - "33d9616f5fc34716b62b6b76940de17f", - "99c1326110284c378d90c411f95d2c41", - "7d7228aabd874fe89402185e323a6d56", - "20f007cefbca4a38a9887914f9b8f908", - "44390d56d7734686b6dab24db04abd1c", - "4f9eee0acf654464bae4a250bc78aec3", - "bbc3470e99a74d9385884ce080281aed", - "0a9b4564bbc24e11aed04594e38190fb", - "122fdca0c85449e0a3cad8828bcb67df", - "6ad9c2cfa49c4b0f9843d9afb3ef4bcc", - "34a265dfe03b4c31bad5c098d2a53a57", - "66157afab32248dc8d040eebee0718d3", - "740ad9c72d404dcea2affe9118a039e8", - "4a9497a6f861409fbe56f64dabcc5aab", - "8c143fc49ac0499a9144d4705ee42fff", - "883189238c3f4928b35158622649598a", - "46e1ce81b2e54ee5936120bc892e8ddd", - "ec201971aa684b97acc1a9ca17f9d180", - "c7900ee411fb4d719907d466ebd9c96f", - "71ac2675223046a893072a3ffc361d05", - "743e80c2b3bb4269a095f6be3b7292ae", - "92088f9533bb48ccb63bf9328304b0e3" - ] + "height": 548 }, "id": "usxaQgkSbEND", - "outputId": "fa47c4f5-3209-44b5-8de9-3999ac71a173" + "outputId": 
"77407ba5-e4c0-4f0e-afe1-34916ea4e266" }, - "execution_count": 15, + "execution_count": 10, "outputs": [ { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "Starting run 1/3\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "Optimization Progress: 0%| | 0/20 [00:00\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_selection\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtrain_test_split\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpreprocessing\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLabelEncoder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mQuantileTransformer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecomposition\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mPCA\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mtpot\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mTPOTClassifier\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetrics\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrecall_score\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/sklearn/decomposition/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextmath\u001b[0m \u001b[0;32mimport\u001b[0m 
\u001b[0mrandomized_svd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m from ._dict_learning import (\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mDictionaryLearning\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mMiniBatchDictionaryLearning\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/sklearn/decomposition/_dict_learning.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0m_fit_context\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m )\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear_model\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLars\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mLasso\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mLassoLars\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morthogonal_mp_gram\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 23\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcheck_random_state\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgen_batches\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgen_even_slices\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_param_validation\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mHidden\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mInterval\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStrOptions\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidate_params\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# complete documentation.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0m_base\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLinearRegression\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0m_bayes\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mARDRegression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBayesianRidge\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m from ._coordinate_descent import (\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_base.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 34\u001b[0m )\n\u001b[1;32m 35\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcheck_random_state\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 36\u001b[0;31m from ..utils._array_api import (\n\u001b[0m\u001b[1;32m 37\u001b[0m \u001b[0m_asarray_with_order\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[0m_average\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mImportError\u001b[0m: cannot import name '_average' from 'sklearn.utils._array_api' (/usr/local/lib/python3.10/dist-packages/sklearn/utils/_array_api.py)", + "", + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0;32m\nNOTE: If your import is failing due to a missing package, you 
can\nmanually install dependencies using either !pip or !apt.\n\nTo view examples of installing some common dependencies, click the\n\"Open Examples\" button below.\n\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n" + ], + "errorDetails": { + "actions": [ + { + "action": "open_url", + "actionText": "Open Examples", + "url": "/notebooks/snippets/importing_libraries.ipynb" + } + ] + } } ] }, { "cell_type": "markdown", "source": [ - "### TPOT Algo Stats: Supervised Learning" + "# TPOT Algo Stats: Supervised Learning" ], "metadata": { "id": "KYJ1AkAStACV" @@ -6147,7 +5281,7 @@ "id": "ZAFOj5Ya7GhW", "outputId": "cc6590f9-1398-4aab-8813-2ad645fab113" }, - "execution_count": 16, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -6173,7 +5307,7 @@ { "cell_type": "markdown", "source": [ - "## Model export: TPOT's best pipeline, Supervised" + "# Model export: TPOT's best pipeline, Supervised" ], "metadata": { "id": "vM6TVwHYtKNa" @@ -6199,7 +5333,7 @@ "id": "ojygkYhE3B_j", "outputId": "35da40ee-f656-4b9f-818b-27e7633570d1" }, - "execution_count": 17, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -6213,7 +5347,7 @@ { "cell_type": "markdown", "source": [ - "## Evaluation of the model" + "# Evaluation of the model" ], "metadata": { "id": "lqXIlzZ0u7oN" @@ -6257,7 +5391,7 @@ "id": "PbOIGDR9cAgM", "outputId": "ce825cea-2038-4420-f857-50db6d358292" }, - "execution_count": 18, + "execution_count": null, "outputs": [ { "output_type": "display_data", @@ -6551,7 +5685,7 @@ { "cell_type": "markdown", "source": [ - "## Confusion matrix" + "# Confusion matrix" ], "metadata": { "id": "l1u70hEaAYgD" @@ -6639,7 +5773,7 @@ { "cell_type": "markdown", "source": [ - "## CV scores" + "# CV scores" ], "metadata": { "id": "0jtLBuvnAeND" @@ -6736,7 +5870,7 @@ { "cell_type": "markdown", "source": [ - "## Grid search optimization" + "# Grid search optimization" ], "metadata": { "id": "_XUIxpRdKzf9" @@ 
-7316,7 +6450,7 @@ { "cell_type": "markdown", "source": [ - "# Tpot (Supervised)" + "# Building a Supervised Learning classifier with Tpot (scikit Algorithms) - or an imbalanced dataset" ], "metadata": { "id": "ZWRH59wKLaAX" @@ -7879,36 +7013,10 @@ "\n", "# Print the summary table\n", "print(\"\\nTop 10 Pipelines Across All Runs and Evaluations:\")\n", - "print(tabulate(summary_table[:10], headers=['Pipeline', 'Count'], tablefmt='grid'))" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 218 - }, - "id": "9qalTrUsqpVD", - "outputId": "5b8e6c0c-5c68-4be8-9741-3acdb5dfe53e" - }, - "execution_count": 22, - "outputs": [ - { - "output_type": "error", - "ename": "TypeError", - "evalue": "'int' object is not iterable", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;31m# Iterate through all runs and all evaluated pipelines\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresults_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mpipeline_str\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'pipelines_tested'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0msimple_pipeline\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0msimplify_pipeline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpipeline_str\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mall_pipeline_stats\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0msimple_pipeline\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: 'int' object is not iterable" - ] - } - ] - }, - { - "cell_type": "code", - "source": [], + "print(tabulate(summary_table[:10], headers=['Pipeline', 'Count'], tablefmt='grid'))" + ], "metadata": { - "id": "aXVq0kzkrdu8" + "id": "9qalTrUsqpVD" }, "execution_count": null, "outputs": [] @@ -7916,7 +7024,7 @@ { "cell_type": "markdown", "source": [ - "# Building a NN classifier with Tpot for an imbalanced dataset" + "# Building a Deep Learning NN classifier with Tpot (scikit Neural Network) - for an imbalanced dataset" ], "metadata": { "id": "PEWJBJqf15m-" @@ -8202,7 +7310,7 @@ "id": "iyExRJRjsDBO", "outputId": "48d4a650-08a3-4766-c47b-058b16b49222" }, - "execution_count": 24, + "execution_count": null, "outputs": [ { "metadata": { @@ -8445,7 +7553,7 @@ "id": "s7pRao7WxkB8", "outputId": "eadc42dd-2673-4d07-f979-cb49327f30ed" }, - "execution_count": 32, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -8595,7 +7703,7 @@ "id": "cmtpHeIzxk7f", "outputId": "3ef42227-4089-4d84-a3fc-de50ab86cc99" }, - "execution_count": 33, + "execution_count": null, "outputs": [ { "output_type": "display_data", @@ -8712,7 +7820,7 @@ "id": "lUqkJTy-0SCE", "outputId": "d343d5d2-741a-4c32-f904-7f4281bf258b" }, - "execution_count": 36, + "execution_count": null, "outputs": [ { "output_type": "stream", @@ -8864,7 +7972,7 @@ { "cell_type": "markdown", "source": [ - "## Using cuML" + "# Attempt: Building a GPU-accelerated cuML classifier with Tpot - for an imbalanced dataset" ], "metadata": { "id": "QJQXi06FdUDN" @@ -8876,728 +7984,761 @@ "import 
numpy as np\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler, LabelEncoder\n", - "from sklearn.impute import SimpleImputer\n", - "from sklearn.metrics import make_scorer, confusion_matrix, recall_score, accuracy_score\n", + "from sklearn.metrics import make_scorer, confusion_matrix, recall_score, accuracy_score, f1_score\n", + "from sklearn.preprocessing import LabelEncoder, QuantileTransformer\n", "from sklearn.decomposition import TruncatedSVD\n", + "from cuml.ensemble import RandomForestClassifier\n", + "from cuml.svm import SVC\n", "from imblearn.over_sampling import SMOTE\n", "from imblearn.under_sampling import RandomUnderSampler\n", "from tpot import TPOTClassifier\n", - "from collections import defaultdict\n", "import time\n", "import gc\n", + "import warnings\n", "\n", - "def print_step_info(X, y, step_name):\n", - " print(f\"\\n--- {step_name} ---\")\n", - " print(f\"X shape: {X.shape}\")\n", - " print(f\"y shape: {y.shape}\")\n", - " print(f\"Class distribution: {np.bincount(y)}\")\n", + "# Suppress warnings\n", + "warnings.filterwarnings('ignore')\n", "\n", "def custom_scorer(y_true, y_pred):\n", " cm = confusion_matrix(y_true, y_pred)\n", - " tn, fp, fn, tp = cm.ravel()\n", - " if (tp + fn) == 0 or (tn + fp) == 0:\n", - " return 0.0\n", - " recall = tp / (tp + fn + 1e-8)\n", - " specificity = tn / (tn + fp + 1e-8)\n", - " recall_weight = 1.0\n", - " specificity_weight = 2.0\n", - " score = (recall_weight * recall + specificity_weight * specificity) / (recall_weight + specificity_weight)\n", - " return score\n", - "\n", - "custom_scorer_obj = make_scorer(custom_scorer, greater_is_better=True)\n", - "\n", - "# Assuming df is already loaded and contains 'message_vector' and 'label' columns\n", - "\n", - "# Encode labels\n", - "le = LabelEncoder()\n", - "df['label_encoded'] = le.fit_transform(df['label'])\n", - "\n", - "# Split data\n", - "X = 
np.array(df['message_vector'].tolist(), dtype='float32')\n", - "y = df['label_encoded'].values\n", - "\n", - "print_step_info(X, y, \"Original Data\")\n", - "\n", - "# Preprocessing\n", - "imputer = SimpleImputer(strategy='mean')\n", - "scaler = StandardScaler()\n", - "pca = TruncatedSVD(n_components=100, random_state=42) # Reduce to 100 components\n", - "\n", - "X = imputer.fit_transform(X)\n", - "X = scaler.fit_transform(X)\n", - "X = pca.fit_transform(X)\n", - "\n", - "print_step_info(X, y, \"After Preprocessing\")\n", - "\n", - "# Split the data\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", - "print_step_info(X_train, y_train, \"Training Data\")\n", - "\n", - "# Resampling\n", - "smote = SMOTE(sampling_strategy=0.5, random_state=42)\n", - "rus = RandomUnderSampler(sampling_strategy=0.5, random_state=42)\n", - "X_resampled, y_resampled = rus.fit_resample(*smote.fit_resample(X_train, y_train))\n", - "print_step_info(X_resampled, y_resampled, \"Resampled Data\")\n", - "\n", - "# Initialize results storage\n", - "results = defaultdict(list)\n", - "\n", - "# Number of runs\n", - "n_runs = 1\n", - "\n", - "# Initialize best_tpot and best_recall\n", - "best_tpot = None\n", - "best_recall = 0\n", - "\n", - "for run in range(n_runs):\n", - " print(f\"\\nStarting run {run + 1}/{n_runs}\")\n", - " start_time = time.time()\n", - "\n", - " # Initialize TPOT\n", - " tpot = TPOTClassifier(\n", - " generations=5,\n", - " population_size=20,\n", - " verbosity=2,\n", - " scoring=custom_scorer_obj,\n", - " random_state=42 + run,\n", - " config_dict=\"TPOT cuML\",\n", - " n_jobs=1, # cuML requires n_jobs=1\n", - " cv=5,\n", - " max_time_mins=120,\n", - " max_eval_time_mins=20\n", - " )\n", - "\n", - " try:\n", - " # Fit TPOT\n", - " tpot.fit(X_resampled, y_resampled)\n", - "\n", - " # Make predictions\n", - " y_pred = tpot.predict(X_test)\n", - "\n", - " # Calculate recall and accuracy scores\n", - " recall = 
recall_score(y_test, y_pred, average='macro')\n", - " accuracy = accuracy_score(y_test, y_pred)\n", - "\n", - " # Update best_tpot if this run has better recall score\n", - " if recall > best_recall:\n", - " best_recall = recall\n", - " best_tpot = tpot\n", - "\n", - " # Get pipeline string\n", - " pipeline_str = str(tpot.fitted_pipeline_)\n", - "\n", - " # Store results\n", - " results['run'].append(run + 1)\n", - " results['recall_score'].append(recall)\n", - " results['accuracy_score'].append(accuracy)\n", - " results['best_pipeline'].append(pipeline_str)\n", - " results['runtime'].append(time.time() - start_time)\n", - " results['pipelines_tested'].append(len(tpot.evaluated_individuals_))\n", - "\n", - " print(f\"Run {run + 1} completed. Recall Score: {recall:.4f}, Accuracy: {accuracy:.4f}, Pipelines tested: {len(tpot.evaluated_individuals_)}\")\n", - "\n", - " except Exception as e:\n", - " print(f\"An error occurred during TPOT optimization: {str(e)}\")\n", - "\n", - " # Clear memory\n", - " gc.collect()\n", - "\n", - "# Convert results to DataFrame\n", - "results_df = pd.DataFrame(results)\n", - "\n", - "# Save results to CSV\n", - "results_df.to_csv('tpot_cuml_recall_results.csv', index=False)\n", - "\n", - "print(\"\\nResults saved to 'tpot_cuml_recall_results.csv'\")\n", - "\n", - "# If best_tpot exists, export the best pipeline\n", - "if best_tpot:\n", - " best_tpot.export('tpot_cuml_best_pipeline.py')\n", - " print(\"Best pipeline exported to 'tpot_cuml_best_pipeline.py'\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000, - "referenced_widgets": [ - "92ce081546c945089cdd8b00604f9190", - "cfb57c0da6cf422f8e56a68fd2fa8620", - "fa57f776a9044f928368ff7299e01bd9", - "2f1b25096e404d069ef24ee45f236a64", - "08bb4312d1a948e986a03788d286dda1", - "17cc427fd9e9467e999b9164774b6eb5", - "db4793323d154fdba9d4d88d5789d8dd", - "cd4d7a2f6adc4f69b07b1b1230153d44", - "18ff42e892cf489fb6fa9003fbdecc6c", - 
"fcf7b83de17d491b8729ce64d8f3cede", - "695f315cc88c477084e7737b132ed93a" - ] - }, - "id": "BDkHYUyp1vWI", - "outputId": "c6c5991b-5978-400e-c571-20290e98ecfc" - }, - "execution_count": 7, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "--- Original Data ---\n", - "X shape: (13455, 30000)\n", - "y shape: (13455,)\n", - "Class distribution: [ 114 13341]\n", - "\n", - "--- After Preprocessing ---\n", - "X shape: (13455, 100)\n", - "y shape: (13455,)\n", - "Class distribution: [ 114 13341]\n", - "\n", - "--- Training Data ---\n", - "X shape: (10764, 100)\n", - "y shape: (10764,)\n", - "Class distribution: [ 91 10673]\n", - "\n", - "--- Resampled Data ---\n", - "X shape: (16008, 100)\n", - "y shape: (16008,)\n", - "Class distribution: [ 5336 10672]\n", - "\n", - "Starting run 1/1\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - "Optimization Progress: 0%| | 0/20 [00:00 1 else np.nan\n", - "\n", - " print(f\"\\n{metric.capitalize()} Analysis:\")\n", - " print(f\" Mean: {mean:.4f}\")\n", - " print(f\" Standard Deviation: {std:.4f}\")\n", - "\n", - " if len(scores) > 1:\n", - " confidence_level = 0.95\n", - " se = std / np.sqrt(len(scores))\n", - " ci = stats.t.interval(confidence_level, len(scores)-1, loc=mean, scale=se)\n", - " print(f\" {confidence_level*100}% Confidence Interval: ({ci[0]:.4f}, {ci[1]:.4f})\")\n", - "\n", - " # Perform t-test to check if scores are significantly different from a baseline\n", - " baseline = 0.5 # You can adjust this value based on your expectations\n", - " t_stat, p_value = stats.ttest_1samp(scores, baseline)\n", - " print(f\"\\nOne-sample t-test (comparing to baseline of {baseline}):\")\n", - " print(f\" t-statistic: {t_stat:.4f}\")\n", - " print(f\" p-value: {p_value:.4f}\")\n", - "\n", - " if len(scores) >= 3:\n", - " # Check for normality of scores\n", - " _, normality_p_value = stats.shapiro(scores)\n", - " print(f\"\\nShapiro-Wilk test for normality of 
{metric}:\")\n", - " print(f\" p-value: {normality_p_value:.4f}\")\n", - " print(f\" {'Scores are normally distributed' if normality_p_value > 0.05 else 'Scores are not normally distributed'}\")\n", - "\n", - " # Calculate effect size (Cohen's d) for scores compared to baseline\n", - " effect_size = (mean - baseline) / std\n", - " print(f\"\\nEffect size (Cohen's d) compared to baseline: {effect_size:.4f}\")\n", - "\n", - " # Plot scores if there's more than one run\n", - " if len(scores) > 1:\n", - " plt.figure(figsize=(10, 6))\n", - " plt.plot(results_df['run'], scores, 'bo-')\n", - " plt.title(f'{metric.capitalize()} Across Runs')\n", - " plt.xlabel('Run')\n", - " plt.ylabel(metric.capitalize())\n", - " plt.grid(True)\n", - " plt.tight_layout()\n", - " plt.show()\n", - "\n", - "# Runtime analysis\n", - "print(\"\\nRuntime Analysis:\")\n", - "print(f\" Total Runtime: {results_df['runtime'].sum():.2f} seconds\")\n", - "print(f\" Average Runtime per Run: {results_df['runtime'].mean():.2f} seconds\")\n", - "\n", - "# Pipelines tested analysis\n", - "print(\"\\nPipelines Tested Analysis:\")\n", - "print(f\" Total Pipelines Tested: {results_df['pipelines_tested'].sum()}\")\n", - "print(f\" Average Pipelines per Run: {results_df['pipelines_tested'].mean():.2f}\")\n", - "\n", - "# Best pipeline (in this case, the only run)\n", - "best_run = results_df.iloc[0]\n", - "print(f\"\\nBest Run (Run {best_run['run']}):\")\n", - "print(f\" Recall Score: {best_run['recall_score']:.4f}\")\n", - "print(f\" Accuracy Score: {best_run['accuracy_score']:.4f}\")\n", - "print(f\" Runtime: {best_run['runtime']:.2f} seconds\")\n", - "print(f\" Pipelines Tested: {best_run['pipelines_tested']}\")\n", - "print(f\" Best Pipeline:\\n{best_run['best_pipeline']}\")\n", - "\n", - "# Plot runtime vs recall and accuracy scores only if there's more than one run\n", - "if len(results_df) > 1:\n", - " fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))\n", - " ax1.scatter(results_df['runtime'], 
results_df['recall_score'])\n", - " ax1.set_title('Runtime vs Recall Score')\n", - " ax1.set_xlabel('Runtime (seconds)')\n", - " ax1.set_ylabel('Recall Score')\n", - " ax1.grid(True)\n", - "\n", - " ax2.scatter(results_df['runtime'], results_df['accuracy_score'])\n", - " ax2.set_title('Runtime vs Accuracy Score')\n", - " ax2.set_xlabel('Runtime (seconds)')\n", - " ax2.set_ylabel('Accuracy Score')\n", - " ax2.grid(True)\n", - "\n", - " plt.tight_layout()\n", - " plt.show()\n", - "\n", - " # Analyze the relationship between runtime and scores\n", - " for metric in ['recall_score', 'accuracy_score']:\n", - " corr, _ = stats.pearsonr(results_df['runtime'], results_df[metric])\n", - " print(f\"\\nCorrelation between runtime and {metric}: {corr:.4f}\")\n", - "else:\n", - " print(\"\\nInsufficient data for runtime vs score analysis and correlation.\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "vD-eHcZD-wdY", - "outputId": "14fc9a53-c9b7-47f0-fe50-492797eed905" - }, - "execution_count": 10, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "Summary Statistics:\n", - " run recall_score accuracy_score runtime pipelines_tested\n", - "count 1.0 1.000000 1.000000 1.000000 1.0\n", - "mean 1.0 0.889055 0.993683 194.357202 115.0\n", - "std NaN NaN NaN NaN NaN\n", - "min 1.0 0.889055 0.993683 194.357202 115.0\n", - "25% 1.0 0.889055 0.993683 194.357202 115.0\n", - "50% 1.0 0.889055 0.993683 194.357202 115.0\n", - "75% 1.0 0.889055 0.993683 194.357202 115.0\n", - "max 1.0 0.889055 0.993683 194.357202 115.0\n", - "\n", - "Recall_score Analysis:\n", - " Mean: 0.8891\n", - " Standard Deviation: nan\n", - "\n", - "Accuracy_score Analysis:\n", - " Mean: 0.9937\n", - " Standard Deviation: nan\n", - "\n", - "Runtime Analysis:\n", - " Total Runtime: 194.36 seconds\n", - " Average Runtime per Run: 194.36 seconds\n", - "\n", - "Pipelines Tested Analysis:\n", - " Total Pipelines Tested: 115\n", - " Average 
Pipelines per Run: 115.00\n", - "\n", - "Best Run (Run 1):\n", - " Recall Score: 0.8891\n", - " Accuracy Score: 0.9937\n", - " Runtime: 194.36 seconds\n", - " Pipelines Tested: 115\n", - " Best Pipeline:\n", - "Pipeline(steps=[('variancethreshold', VarianceThreshold(threshold=0.05)),\n", - " ('xgbclassifier',\n", - " XGBClassifier(alpha=1, base_score=None, booster=None,\n", - " callbacks=None, colsample_bylevel=None,\n", - " colsample_bynode=None, colsample_bytree=None,\n", - " device=None, early_stopping_rounds=None,\n", - " enable_categorical=False, eval_metric=None,\n", - " feature_types=None, gamma=None, grow_policy=None,\n", - " importance_type=None,\n", - " interaction_constraints=None, learning_rate=1.0,\n", - " max_bin=None, max_cat_threshold=None,\n", - " max_cat_to_onehot=None, max_delta_step=None,\n", - " max_depth=9, max_leaves=None,\n", - " min_child_weight=10, missing=nan,\n", - " monotone_constraints=None, multi_strategy=None,\n", - " n_estimators=100, n_jobs=1,\n", - " num_parallel_tree=None, ...))])\n", - "\n", - "Insufficient data for runtime vs score analysis and correlation.\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import re\n", + " tn, fp, fn, tp = cm.ravel()\n", + " if (tp + fn) == 0 or (tn + fp) == 0:\n", + " return 0.0\n", + " recall = tp / (tp + fn + 1e-8)\n", + " specificity = tn / (tn + fp + 1e-8)\n", + " recall_weight = 1.0\n", + " specificity_weight = 2.0\n", + " score = (recall_weight * recall + specificity_weight * specificity) / (recall_weight + specificity_weight)\n", + " return score\n", "\n", - "# Assuming results_df is already loaded from 'tpot_nn_recall_results.csv'\n", - "# If not, uncomment the following line:\n", - "# results_df = pd.read_csv('tpot_nn_recall_results.csv')\n", + "custom_scorer_obj = make_scorer(custom_scorer, greater_is_better=True)\n", "\n", - "def extract_pipeline_components(pipeline_str):\n", - 
" # Extract all components from the pipeline string, excluding 'Pipeline'\n", - " components = re.findall(r'(\\w+)\\(', pipeline_str)\n", - " return ', '.join([comp for comp in components if comp != 'Pipeline'])\n", + "# Assuming df is already loaded and contains 'message_vector' and 'label' columns\n", "\n", - "# Extract pipeline components\n", - "results_df['pipeline_components'] = results_df['best_pipeline'].apply(extract_pipeline_components)\n", + "# Prepare data\n", + "X = np.array(df['message_vector'].tolist(), dtype='float32')\n", + "y = df['label'].values\n", "\n", - "# Create a table of pipeline components and their frequencies\n", - "component_counts = results_df['pipeline_components'].value_counts().reset_index()\n", - "component_counts.columns = ['Pipeline Components', 'Frequency']\n", - "component_counts['Percentage'] = component_counts['Frequency'] / len(results_df) * 100\n", + "# Ensure y is numeric\n", + "le = LabelEncoder()\n", + "y = le.fit_transform(y)\n", "\n", - "print(\"Pipeline Components Frequency Table:\")\n", - "print(component_counts.to_string(index=False))\n", + "# Preprocessing\n", + "qt = QuantileTransformer(n_quantiles=1000, output_distribution='normal')\n", + "pca = TruncatedSVD(n_components=min(100, X.shape[1] - 1), random_state=42)\n", + "X = qt.fit_transform(X)\n", + "X = pca.fit_transform(X)\n", "\n", - "# Visualize pipeline components frequency\n", - "plt.figure(figsize=(12, 6))\n", - "component_counts.plot(kind='bar', x='Pipeline Components', y='Frequency')\n", - "plt.title('Frequency of Pipeline Components in Best Pipelines')\n", - "plt.xlabel('Pipeline Components')\n", - "plt.ylabel('Frequency')\n", - "plt.xticks(rotation=45, ha='right')\n", - "plt.tight_layout()\n", - "plt.show()\n", + "# Ensure X is float32 (required for cuML)\n", + "X = X.astype(np.float32)\n", "\n", - "# Find the best performing pipeline components\n", - "best_components = results_df.loc[results_df['recall_score'].idxmax(), 'pipeline_components']\n", - 
"print(f\"\\nComponents of the best performing pipeline: {best_components}\")\n", + "# Split the data\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", "\n", - "# Analyze individual components\n", - "all_components = [comp.strip() for comps in results_df['pipeline_components'].str.split(',') for comp in comps]\n", - "component_freq = pd.Series(all_components).value_counts()\n", + "# Resampling\n", + "smote = SMOTE(sampling_strategy=0.5, random_state=42)\n", + "rus = RandomUnderSampler(sampling_strategy=0.5, random_state=42)\n", + "X_train, y_train = rus.fit_resample(*smote.fit_resample(X_train, y_train))\n", "\n", - "print(\"\\nIndividual Component Frequency:\")\n", - "print(component_freq)\n", + "# Ensure y_train is int32 (required for cuML)\n", + "y_train = y_train.astype(np.int32)\n", "\n", - "# Visualize individual component frequency\n", - "plt.figure(figsize=(10, 6))\n", - "component_freq.plot(kind='bar')\n", - "plt.title('Frequency of Individual Components in Best Pipelines')\n", - "plt.xlabel('Component')\n", - "plt.ylabel('Frequency')\n", - "plt.xticks(rotation=45, ha='right')\n", - "plt.tight_layout()\n", - "plt.show()\n", + "# Custom configuration for TPOT\n", + "tpot_config = {\n", + " 'cuml.ensemble.RandomForestClassifier': {\n", + " 'n_estimators': [100, 200, 500],\n", + " 'max_depth': [10, 20, 30, None],\n", + " 'min_samples_split': [2, 5, 10],\n", + " 'min_samples_leaf': [1, 2, 4],\n", + " 'max_features': ['sqrt', 'log2', 0.1, 0.3, 0.5],\n", + " 'class_weight': [None, 'balanced', 'balanced_subsample'],\n", + " # 'criterion': ['gini', 'entropy'],\n", + " 'bootstrap': [True, False]\n", + " }\n", + "}\n", "\n", - "# Additional analysis: Component combination performance\n", - "results_df['mean_recall'] = results_df.groupby('pipeline_components')['recall_score'].transform('mean')\n", - "best_combo = results_df.loc[results_df['mean_recall'].idxmax(), 'pipeline_components']\n", - 
"best_combo_recall = results_df.loc[results_df['mean_recall'].idxmax(), 'mean_recall']\n", + "# Initialize and run TPOT\n", + "tpot = TPOTClassifier(\n", + " generations=5,\n", + " population_size=20,\n", + " cv=5,\n", + " scoring=custom_scorer_obj,\n", + " random_state=42,\n", + " config_dict=tpot_config,\n", + " verbosity=2,\n", + " n_jobs=1,\n", + " max_time_mins=60,\n", + " max_eval_time_mins=5\n", + ")\n", "\n", - "print(f\"\\nBest performing component combination: {best_combo}\")\n", - "print(f\"Mean recall score: {best_combo_recall:.4f}\")\n", + "start_time = time.time()\n", + "tpot.fit(X_train, y_train)\n", + "print(f\"TPOT optimization completed in {time.time() - start_time:.2f} seconds\")\n", "\n", - "# [Rest of the previous analysis code remains the same]\n", + "# Evaluate the best pipeline\n", + "y_pred = tpot.predict(X_test)\n", + "recall = recall_score(y_test, y_pred, average='macro')\n", + "cm = confusion_matrix(y_test, y_pred)\n", + "tn, fp, fn, tp = cm.ravel()\n", + "specificity = tn / (tn + fp + 1e-8)\n", + "accuracy = accuracy_score(y_test, y_pred)\n", + "f1 = f1_score(y_test, y_pred, average='weighted')\n", + "custom_score = custom_scorer(y_test, y_pred)\n", "\n", - "# ... [Keep all the previous analysis code] ..." 
+ "print(f\"Best pipeline: {tpot.fitted_pipeline_}\")\n", + "print(f\"Recall Score: {recall:.4f}\")\n", + "print(f\"Specificity: {specificity:.4f}\")\n", + "print(f\"Accuracy: {accuracy:.4f}\")\n", + "print(f\"F1 Score: {f1:.4f}\")\n", + "print(f\"Custom Score: {custom_score:.4f}\")\n", + "\n", + "print(\"Confusion Matrix:\")\n", + "print(cm)\n", + "\n", + "# Export the best pipeline\n", + "tpot.export('tpot_best_pipeline.py')\n", + "print(\"Best pipeline exported to 'tpot_best_pipeline.py'\")\n", + "\n", + "gc.collect()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", - "height": 1000 + "height": 1000, + "referenced_widgets": [ + "a720f2592e3d4bef98a4c6f5077f277f", + "d2ae2ec449c14266b009f213df117da2", + "979b4554ad0b474b883040d9bbd51274", + "be36aefa1b3e4ea6a776e02c082c9958", + "7fc5e7ea894245579a1abdeed4492954", + "88b18cc3b9cf4eaa8edc5b16aaf28821", + "d5935c9201ed471ab2385df6e607380c", + "4aa0b28b2aa44951aad3bfe37fa20051", + "43490d899aed4338823374634d05c9f1", + "512a81b06bd54811bd8b9f80958fa7a4", + "201d44e85f374a06929ae8db5851d61b" + ] }, - "id": "tmGKt3iB_ze9", - "outputId": "2009081b-49e0-421e-d1a2-ece6eb3526ab" + "id": "vD-eHcZD-wdY", + "outputId": "afd6d273-d0bc-4353-cb2a-b2d6ef458cd1" }, - "execution_count": 11, + "execution_count": 9, "outputs": [ { "output_type": "stream", - "name": "stdout", + "name": "stderr", "text": [ - "Pipeline Components Frequency Table:\n", - " Pipeline Components Frequency Percentage\n", - "VarianceThreshold, XGBClassifier 1 100.0\n" + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + 
"Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in 
cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: 
\n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 
318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception 
ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File 
\"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + 
"AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in 
cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File 
\"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 
314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + 
" File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ - "
" - ] + "Optimization Progress: 0%| | 0/20 [00:00" - ], - "image/png": "\n" - }, - "metadata": {} + "output_type": "stream", + "name": "stderr", + "text": [ + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call 
last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Generation 1 - Current best internal CV score: 0.9999062792799144\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent 
call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in 
cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n" + ] }, { "output_type": "stream", "name": "stdout", "text": [ "\n", - "Components of the best performing pipeline: VarianceThreshold, XGBClassifier\n", + "Generation 2 - Current best internal CV score: 0.9999062792799144\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in 
cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ "\n", - "Individual Component Frequency:\n", - "VarianceThreshold 1\n", - "XGBClassifier 1\n", - "Name: count, dtype: int64\n" + "Generation 3 - Current best internal CV score: 0.9999375195173401\n" ] }, { - "output_type": "display_data", - "data": { - "text/plain": [ - "
" - ], - "image/png": "\n" - }, - "metadata": {} + "output_type": "stream", + "name": "stderr", + "text": [ + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Generation 4 - Current best 
internal CV score: 0.9999375195173401\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File 
\"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in 
cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n", + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"randomforestclassifier.pyx\", line 314, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__\n", + " File \"randomforestclassifier.pyx\", line 318, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data\n", + " File \"base.pyx\", line 330, in cuml.internals.base.Base.__getattr__\n", + "AttributeError: rf_forest\n" + ] }, { "output_type": "stream", "name": "stdout", "text": [ "\n", - "Best performing component combination: VarianceThreshold, XGBClassifier\n", - "Mean recall score: 0.8891\n" + "Generation 5 - Current best internal CV score: 0.9999375195173401\n", + "\n", + "Best pipeline: RandomForestClassifier(CombineDFs(input_matrix, input_matrix), bootstrap=False, class_weight=None, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200)\n", 
+ "TPOT optimization completed in 493.37 seconds\n", + "Best pipeline: Pipeline(steps=[('featureunion',\n", + " FeatureUnion(transformer_list=[('functiontransformer-1',\n", + " FunctionTransformer(func=)),\n", + " ('functiontransformer-2',\n", + " FunctionTransformer(func=))])),\n", + " ('randomforestclassifier', RandomForestClassifier())])\n", + "Recall Score: 0.5652\n", + "Specificity: 0.1304\n", + "Accuracy: 0.9926\n", + "F1 Score: 0.9897\n", + "Custom Score: 0.4203\n", + "Confusion Matrix:\n", + "[[ 3 20]\n", + " [ 0 2668]]\n", + "Best pipeline exported to 'tpot_best_pipeline.py'\n" ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "2071" + ] + }, + "metadata": {}, + "execution_count": 9 } ] }, @@ -9608,7 +8749,7 @@ "import matplotlib.pyplot as plt\n", "\n", "# Make predictions\n", - "y_pred = best_tpot.predict(X_test)\n", + "y_pred = tpot.predict(X_test)\n", "\n", "# Create the confusion matrix\n", "cm = confusion_matrix(y_test, y_pred)\n", @@ -9649,9 +8790,9 @@ "height": 653 }, "id": "lVua-bgO_9T8", - "outputId": "9442618d-abb0-4f6c-940f-2cf17041cd3b" + "outputId": "67a8ddee-ab06-4b6e-a946-9ccb175b7401" }, - "execution_count": 13, + "execution_count": 11, "outputs": [ { "output_type": "display_data", @@ -9668,7 +8809,7 @@ "text/plain": [ "
" ], - "image/png": "\n" + "image/png": "\n" }, "metadata": {} }, @@ -9677,14 +8818,103 @@ "name": "stdout", "text": [ "Confusion Matrix:\n", - "[[ 18 5]\n", - " [ 12 2656]]\n", + "[[ 3 20]\n", + " [ 0 2668]]\n", + "\n", + "Precision: 0.9926\n", + "Recall: 1.0000\n", + "F1-score: 0.9963\n", + "Specificity: 0.1304\n", + "Custom Score: 0.4203\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.decomposition import TruncatedSVD\n", + "\n", + "# Assume these are saved from the training process\n", + "# saved_imputer = imputer # The SimpleImputer used in training\n", + "\n", + "\n", + "# Vectorize the messages\n", + "vector1 = vectorize_text(random_good_message)\n", + "vector2 = vectorize_text(random_bad_message)\n", + "\n", + "print(\"Shape of vector1:\", vector1.shape)\n", + "print(\"Shape of vector2:\", vector2.shape)\n", + "\n", + "# Combine the vectors\n", + "X_new = np.vstack((vector1, vector2))\n", + "\n", + "print(\"Shape of X_new before preprocessing:\", X_new.shape)\n", + "\n", + "\n", + "X_new = qt.transform(X_new)\n", + "print(\"Shape after scaling:\", X_new.shape)\n", + "\n", + "X_new = pca.transform(X_new)\n", + "print(\"Shape after PCA:\", X_new.shape)\n", + "\n", + "# Ensure X_new has exactly 100 features\n", + "assert X_new.shape[1] == 100, f\"X_new has {X_new.shape[1]} features instead of 100\"\n", + "\n", + "# Make predictions using your TPOT model\n", + "predictions = tpot.predict(X_new)\n", + "probabilities = tpot.predict_proba(X_new)\n", + "\n", + "print(random_bad_message)\n", + "\n", + "# Print results\n", + "for i, (pred, prob) in enumerate(zip(predictions, probabilities)):\n", + " print(f\"Message {i+1}:\")\n", + " print(f\"Predicted class: {pred}\")\n", + " print(f\"Class probabilities: {prob}\")\n", + " print()\n", + "\n", + "# Print the shape of X_new for final confirmation\n", + 
"print(f\"Final shape of X_new: {X_new.shape}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PTB4jZoZBDQU", + "outputId": "cd93f788-e7b0-4b9f-cb33-33d5f617b933" + }, + "execution_count": 26, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Shape of vector1: (1, 30000)\n", + "Shape of vector2: (1, 30000)\n", + "Shape of X_new before preprocessing: (2, 30000)\n", + "Shape after scaling: (2, 30000)\n", + "Shape after PCA: (2, 100)\n", + "File created:\n", + "RuleName: EXE\n", + "UtcTime: 2024-07-28 15:53:22.557\n", + "ProcessGuid: {18e8265a-696d-66a6-7705-000000004400}\n", + "ProcessId: 10624\n", + "Image: C:\\Program Files\\Microsoft Office\\Root\\Office16\\EXCEL.EXE\n", + "TargetFilename: C:\\Users\\student\\AppData\\Local\\Temp\\file.exe\n", + "CreationUtcTime: 2024-07-23 14:24:50.520\n", + "Message 1:\n", + "Predicted class: 1.0\n", + "Class probabilities: [0.12708335 0.8729167 ]\n", + "\n", + "Message 2:\n", + "Predicted class: 1.0\n", + "Class probabilities: [0.12375 0.87625]\n", "\n", - "Precision: 0.9981\n", - "Recall: 0.9955\n", - "F1-score: 0.9968\n", - "Specificity: 0.7826\n", - "Custom Score: 0.8536\n" + "Final shape of X_new: (2, 100)\n" ] } ]