Merge pull request #101 from zStupan/main
Updated docs and examples
zStupan authored Oct 31, 2023
2 parents 5cacca0 + f714e70 commit fc7ae85
Showing 8 changed files with 240 additions and 22 deletions.
28 changes: 17 additions & 11 deletions README.md
@@ -92,7 +92,9 @@ data = Dataset('datasets/Abalone.csv')
print(data)
```

### Data Squashing
### Preprocessing

#### Data Squashing

Optionally, a preprocessing technique called data squashing [5] can be applied. It significantly reduces the number of transactions while producing results similar to those obtained on the original dataset.

@@ -104,7 +106,9 @@ squashed = squash(dataset, threshold=0.9, similarity='euclidean')
print(squashed)
```

### Mining association rules the easy way (recommended)
### Mining association rules

#### The easy way (recommended)

Association rule mining can be easily performed using the `get_rules` function:

@@ -124,7 +128,7 @@ print(f'Run Time: {run_time}')
rules.to_csv('output.csv')
```

### Mining association rules the hard way
#### The hard way

The above example can also be implemented using a lower-level interface,
with the `NiaARM` class directly:
@@ -137,7 +141,7 @@ from niapy.task import Task, OptimizationType

data = Dataset("datasets/Abalone.csv")

# Create a problem:::
# Create a problem
# dimension represents the dimension of the problem;
# features is the list of features and transactions is the list of transactions;
# metrics is a sequence of metrics to be taken into account when computing the fitness;
@@ -162,6 +166,12 @@ problem.rules.sort()
problem.rules.to_csv('output.csv')
```

#### Interest measures

The framework implements several popular interest measures, which can be used to compute the fitness function value of rules
and to assess the quality of the mined rules. A full list of the implemented interest measures, along with their descriptions
and equations, can be found [here](interest_measures.md).
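
For a quick illustration, here is a minimal sketch of mining with several measures as optimization objectives and then reading the computed values off the mined rules. It assumes, consistent with the examples above, that the mined `Rule` objects expose the chosen measures as attributes such as `support`, `confidence` and `lift`:

```python
from niaarm import Dataset, get_rules

data = Dataset('datasets/Abalone.csv')

# any of the implemented interest measures can be combined in the fitness function
metrics = ('support', 'confidence', 'lift')
rules, _ = get_rules(data, 'DifferentialEvolution', metrics, max_evals=1000, seed=1234)

# each mined rule carries the computed measure values
for rule in rules[:5]:
    print(rule, rule.support, rule.confidence, rule.lift)
```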

### Visualization

The framework currently supports the hill slopes visualization method presented in [4]. More visualization methods are planned
@@ -207,13 +217,9 @@ algorithm = ParticleSwarmOptimization(population_size=200, seed=123)
metrics = ('support', 'confidence', 'aws')
rules, time = get_text_rules(corpus, max_terms=5, algorithm=algorithm, metrics=metrics, max_evals=10000, logging=True)

if len(rules):
    print(rules)
    print(f'Run time: {time:.2f}s')
    rules.to_csv('output.csv')
else:
    print('No rules generated')
    print(f'Run time: {time:.2f}s')
print(rules)
print(f'Run time: {time:.2f}s')
rules.to_csv('output.csv')
```

**Note:** You may need to download stopwords and the punkt tokenizer from nltk by running `import nltk; nltk.download('stopwords'); nltk.download('punkt')`.
2 changes: 1 addition & 1 deletion examples/basic_run.py
@@ -8,7 +8,7 @@
# load and preprocess the dataset from csv
data = Dataset("datasets/Abalone.csv")

# Create a problem:::
# Create a problem
# dimension represents the dimension of the problem;
# features is the list of features and transactions is the list of transactions;
# the following 4 elements represent weights (support, confidence, coverage, shrinkage)
7 changes: 6 additions & 1 deletion examples/basic_run_with_get_rules.py
@@ -1,13 +1,18 @@
from niaarm import Dataset, get_rules
from niapy.algorithms.basic import DifferentialEvolution


# load dataset
data = Dataset("datasets/Abalone.csv")

# initialize the algorithm
algo = DifferentialEvolution(
    population_size=50, differential_weight=0.5, crossover_probability=0.9
)

# define metrics to be used in fitness computation
metrics = ("support", "confidence")

# mine association rules
res = get_rules(data, algo, metrics, max_iters=30, logging=True)
# or rules, run_time = get_rules(...)

6 changes: 5 additions & 1 deletion examples/data_squashing.py
@@ -1,7 +1,11 @@
from niaarm.dataset import Dataset
from niaarm.preprocessing import squash


# load dataset
dataset = Dataset("datasets/Abalone.csv")

# squash the dataset with a threshold of 0.9, using Euclidean distance as a similarity measure
squashed = squash(dataset, threshold=0.9, similarity="euclidean")

# print the squashed dataset
print(squashed)
18 changes: 10 additions & 8 deletions examples/text_mining.py
@@ -3,9 +3,11 @@
from niaarm.mine import get_text_rules
from niapy.algorithms.basic import ParticleSwarmOptimization

# load corpus and extract the documents as a list of strings
df = pd.read_json("datasets/text/artm_test_dataset.json", orient="records")
documents = df["text"].tolist()

# create a Corpus object from the documents (requires nltk's punkt tokenizer and the stopwords list)
try:
    corpus = Corpus.from_list(documents)
except LookupError:
@@ -15,21 +17,21 @@
nltk.download("stopwords")
corpus = Corpus.from_list(documents)

# the rest is pretty much the same as with the numerical association rules
# 1. Init algorithm
# 2. Define metrics
# 3. Run algorithm
algorithm = ParticleSwarmOptimization(population_size=200, seed=123)
metrics = ("support", "confidence", "aws")
rules, time = get_text_rules(
    corpus,
    max_terms=5,
    max_terms=8,
    algorithm=algorithm,
    metrics=metrics,
    max_evals=10000,
    logging=True,
)

if len(rules):
    print(rules)
    print(f"Run time: {time:.2f}s")
    rules.to_csv("output.csv")
else:
    print("No rules generated")
    print(f"Run time: {time:.2f}s")
print(rules)
print(f"Run time: {time:.2f}s")
rules.to_csv("output.csv")
3 changes: 3 additions & 0 deletions examples/visualization.py
@@ -2,11 +2,14 @@
from niaarm import Dataset, get_rules
from niaarm.visualize import hill_slopes

# Load dataset and mine rules
dataset = Dataset("datasets/Abalone.csv")
metrics = ("support", "confidence")
rules, _ = get_rules(
    dataset, "DifferentialEvolution", metrics, max_evals=1000, seed=1234
)

# Visualize any rule using the hill_slope function like so:
some_rule = rules[150]
print(some_rule)
fig, ax = hill_slopes(some_rule, dataset.transactions)
194 changes: 194 additions & 0 deletions interest_measures.md
@@ -0,0 +1,194 @@
# Interest Measures

## Support

Support is defined on an itemset $`X`$ as the proportion of transactions that contain it.

```math
supp(X) = \frac{n_{X}}{|D|},
```

where $`|D|`$ is the number of records in the transactional database.

For an association rule, support is defined as the support of all the attributes in the rule, i.e. the proportion of transactions that contain both the antecedent and the consequent.

```math
supp(X \implies Y) = \frac{n_{XY}}{|D|}
```

**Range:** $`[0, 1]`$

**Reference:** Michael Hahsler, A Probabilistic Comparison of Commonly Used Interest Measures for Association Rules,
2015, URL: https://mhahsler.github.io/arules/docs/measures
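
As an illustrative sketch (not part of the framework), support can be computed directly from transactions represented as sets of items; the data below is hypothetical:

```python
transactions = [
    {'bread', 'milk'},
    {'bread', 'butter'},
    {'bread', 'milk', 'butter'},
    {'milk'},
]


def support(itemset, transactions):
    """Proportion of transactions containing every item of the itemset."""
    n = sum(1 for t in transactions if itemset <= t)
    return n / len(transactions)


print(support({'bread'}, transactions))          # 3/4 = 0.75
print(support({'bread', 'milk'}, transactions))  # 2/4 = 0.5
```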

## Confidence

Confidence of the rule, defined as the proportion of transactions that contain
the consequent in the set of transactions that contain the antecedent. This proportion is an estimate
of the probability of seeing the consequent, if the antecedent is present in the transaction.

```math
conf(X \implies Y) = \frac{supp(X \implies Y)}{supp(X)}
```

**Range:** $`[0, 1]`$

**Reference:** Michael Hahsler, A Probabilistic Comparison of Commonly Used Interest Measures for Association Rules,
2015, URL: https://mhahsler.github.io/arules/docs/measures
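
For example, with the hypothetical values from the support sketch above, $`supp(X \implies Y) = 0.5`$ and $`supp(X) = 0.75`$:

```math
conf(X \implies Y) = \frac{0.5}{0.75} \approx 0.67
```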

## Lift

Lift measures how many times more often the antecedent $`X`$ and the consequent $`Y`$
occur together than expected if they were statistically independent.

```math
lift(X \implies Y) = \frac{conf(X \implies Y)}{supp(Y)}
```

**Range:** $`[0, \infty]`$ (1 means independence)

**Reference:** Michael Hahsler, A Probabilistic Comparison of Commonly Used Interest Measures for Association Rules,
2015, URL: https://mhahsler.github.io/arules/docs/measures
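
Continuing the hypothetical example, with $`conf(X \implies Y) \approx 0.67`$ and $`supp(Y) = 0.75`$:

```math
lift(X \implies Y) = \frac{0.67}{0.75} \approx 0.89,
```

so the antecedent and the consequent occur together slightly less often than expected under independence.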

## Coverage

Coverage, also known as antecedent support, is an estimate of the probability that
the rule applies to a randomly selected transaction. It is the proportion of transactions
that contain the antecedent.

```math
cover(X \implies Y) = supp(X)
```

**Range:** $`[0, 1]`$

**Reference:** Michael Hahsler, A Probabilistic Comparison of Commonly Used Interest Measures for Association Rules,
2015, URL: https://mhahsler.github.io/arules/docs/measures

## RHS Support

Support of the consequent.

```math
RHSsupp(X \implies Y) = supp(Y)
```

**Range:** $`[0, 1]`$

**Reference:** Michael Hahsler, A Probabilistic Comparison of Commonly Used Interest Measures for Association Rules,
2015, URL: https://mhahsler.github.io/arules/docs/measures

## Conviction

Conviction can be interpreted as the ratio of the expected frequency that the antecedent occurs without
the consequent (if they were independent) to the observed frequency of incorrect predictions.

```math
conv(X \implies Y) = \frac{1 - supp(Y)}{1 - conf(X \implies Y)}
```

**Range:** $`[0, \infty]`$ (1 means independence, $`\infty`$ means the rule always holds)

**Reference:** Michael Hahsler, A Probabilistic Comparison of Commonly Used Interest Measures for Association Rules,
2015, URL: https://mhahsler.github.io/arules/docs/measures
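
With the same hypothetical values ($`supp(Y) = 0.75`$, $`conf(X \implies Y) = 2/3`$):

```math
conv(X \implies Y) = \frac{1 - 0.75}{1 - 2/3} = 0.75
```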

## Inclusion

Inclusion is defined as the ratio between the number of attributes in the rule
and the number of all attributes in the database.

```math
inclusion(X \implies Y) = \frac{|X \cup Y|}{m},
```

where $`m`$ is the total number of attributes in the transactional database.


**Range:** $`[0, 1]`$

**Reference:** I. Fister Jr., V. Podgorelec, I. Fister. Improved Nature-Inspired Algorithms for Numeric Association
Rule Mining. In: Vasant P., Zelinka I., Weber GW. (eds) Intelligent Computing and Optimization. ICO 2020. Advances in
Intelligent Systems and Computing, vol 1324. Springer, Cham.
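
For instance, a hypothetical rule covering 3 attributes in a database with $`m = 9`$ attributes has:

```math
inclusion(X \implies Y) = \frac{3}{9} = \frac{1}{3}
```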

## Amplitude

Amplitude measures the quality of a rule, preferring attributes with smaller intervals.

```math
ampl(X \implies Y) = 1 - \frac{1}{n}\sum_{k = 1}^{n}{\frac{Ub_k - Lb_k}{max(o_k) - min(o_k)}},
```

where $`n`$ is the total number of attributes in the rule, $`Ub_k`$ and $`Lb_k`$ are upper and lower
bounds of the selected attribute, and $`max(o_k)`$ and $`min(o_k)`$ are the maximum and minimum
feasible values of the attribute $`o_k`$ in the transactional database.

**Range:** $`[0, 1]`$

**Reference:** I. Fister Jr., I. Fister. A brief overview of swarm intelligence-based algorithms for numerical
association rule mining. arXiv preprint arXiv:2010.15524 (2020).
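
A minimal sketch of the computation on hypothetical numeric attributes, where the bounds $`Lb_k`$ and $`Ub_k`$ come from the rule and $`min(o_k)`$ and $`max(o_k)`$ from the whole database:

```python
# hypothetical attributes: (lower bound, upper bound, feature minimum, feature maximum)
attributes = [
    (0.2, 0.4, 0.0, 1.0),   # interval covers 20% of the feature's range
    (5.0, 6.0, 0.0, 10.0),  # interval covers 10% of the feature's range
]


def amplitude(attributes):
    n = len(attributes)
    return 1 - sum((ub - lb) / (fmax - fmin) for lb, ub, fmin, fmax in attributes) / n


print(amplitude(attributes))  # 1 - (0.2 + 0.1) / 2 = 0.85
```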

## Interestingness

Interestingness of the rule, defined as:

```math
interest(X \implies Y) = \frac{supp(X \implies Y)}{supp(X)} \cdot \frac{supp(X \implies Y)}{supp(Y)}
\cdot (1 - \frac{supp(X \implies Y)}{|D|})
```

Here, the first part gives us the probability of generating the rule based on the antecedent, the second part
gives us the probability of generating the rule based on the consequent, and the third part is the probability
that the rule won't be generated. Thus, rules with very high support will be deemed uninteresting.

**Range:** $`[0, 1]`$

**Reference:** I. Fister Jr., I. Fister. A brief overview of swarm intelligence-based algorithms for numerical
association rule mining. arXiv preprint arXiv:2010.15524 (2020).
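
Using the formula as given with the hypothetical values $`supp(X \implies Y) = 0.5`$, $`supp(X) = supp(Y) = 0.75`$ and $`|D| = 4`$:

```math
interest(X \implies Y) = \frac{0.5}{0.75} \cdot \frac{0.5}{0.75} \cdot \left(1 - \frac{0.5}{4}\right) \approx 0.39
```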

## Comprehensibility

Comprehensibility of the rule. Rules with fewer attributes in the consequent are more
comprehensible.

```math
comp(X \implies Y) = \frac{log(1 + |Y|)}{log(1 + |X \cup Y|)}
```

**Range:** $`[0, 1]`$

**Reference:** I. Fister Jr., I. Fister. A brief overview of swarm intelligence-based algorithms for numerical
association rule mining. arXiv preprint arXiv:2010.15524 (2020).
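
For example, a hypothetical rule with one attribute in the consequent and three attributes in total gives:

```math
comp(X \implies Y) = \frac{log(1 + 1)}{log(1 + 3)} = \frac{log 2}{log 4} = 0.5
```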

## Netconf

The netconf metric evaluates the interestingness of a rule based on the support of the rule and the
supports of its antecedent and consequent.

```math
netconf(X \implies Y) = \frac{supp(X \implies Y) - supp(X)supp(Y)}{supp(X)(1 - supp(X))}
```

**Range:** $`[-1, 1]`$ (Negative values represent negative dependence, positive values represent positive
dependence and 0 represents independence)

**Reference:** E. V. Altay and B. Alatas, "Sensitivity Analysis of MODENAR Method for Mining of Numeric Association
Rules," 2019 1st International Informatics and Software Engineering Conference (UBMYK), 2019, pp. 1-6,
doi: 10.1109/UBMYK48245.2019.8965539.
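
With the hypothetical values used above ($`supp(X \implies Y) = 0.5`$, $`supp(X) = supp(Y) = 0.75`$):

```math
netconf(X \implies Y) = \frac{0.5 - 0.75 \cdot 0.75}{0.75(1 - 0.75)} \approx -0.33,
```

a mild negative dependence, consistent with the lift value below 1 computed earlier.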

## Yule's Q

The Yule's Q metric represents the correlation between two possibly related dichotomous events.

```math
yulesq(X \implies Y) =
\frac{supp(X \implies Y)supp(\neg X \implies \neg Y) - supp(X \implies \neg Y)supp(\neg X \implies Y)}
{supp(X \implies Y)supp(\neg X \implies \neg Y) + supp(X \implies \neg Y)supp(\neg X \implies Y)}
```

**Range:** $`[-1, 1]`$ (-1 reflects total negative association, 1 reflects perfect positive association
and 0 reflects independence)

**Reference:** E. V. Altay and B. Alatas, "Sensitivity Analysis of MODENAR Method for Mining of Numeric Association
Rules," 2019 1st International Informatics and Software Engineering Conference (UBMYK), 2019, pp. 1-6,
doi: 10.1109/UBMYK48245.2019.8965539.
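
A minimal sketch of the computation from the four joint supports (hypothetical values summing to 1):

```python
# hypothetical joint supports: X and Y, neither, X without Y, Y without X
s_xy, s_nxny, s_xny, s_nxy = 0.4, 0.3, 0.1, 0.2


def yulesq(s_xy, s_nxny, s_xny, s_nxy):
    concordant = s_xy * s_nxny
    discordant = s_xny * s_nxy
    return (concordant - discordant) / (concordant + discordant)


print(yulesq(s_xy, s_nxny, s_xny, s_nxy))  # (0.12 - 0.02) / (0.12 + 0.02) ≈ 0.71
```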
4 changes: 4 additions & 0 deletions niaarm/rule_list.py
Original file line number Diff line number Diff line change
@@ -83,6 +83,10 @@ def to_csv(self, filename):
            filename (str): File to save the rules to.
        """
        if not self:
            print("No rules to output")
            return

        with open(filename, "w", newline="") as f:
            writer = csv.writer(f)
Expand Down
