From 177405603ce570fe4cb4d5a0efab5efc5c16558c Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Fri, 1 May 2020 15:30:18 +0200 Subject: [PATCH 01/18] Add class that converts pandas.DataFrame to VW input format --- python/tests/test_pyvw.py | 32 ++++++++ python/vowpalwabbit/pyvw.py | 145 ++++++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+) diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py index 4599ea827d0..f118f19743e 100644 --- a/python/tests/test_pyvw.py +++ b/python/tests/test_pyvw.py @@ -2,7 +2,9 @@ from vowpalwabbit import pyvw from vowpalwabbit.pyvw import vw +from vowpalwabbit.pyvw import DataFrameToVW import pytest +import pandas as pd BIT_SIZE = 18 @@ -344,3 +346,33 @@ def check_error_raises(type, argument): """ with pytest.raises(type) as error: argument() + + +def test_oneline_simple_conversion(): + df = pd.DataFrame({"y": [1], "x": [2]}) + conv = DataFrameToVW(df, "y | a") + lines_list = conv.process_df() + first_line = lines_list[0] + assert first_line == "1 | 2" + +def test_oneline_with_column_renaming_and_tag(): + df = pd.DataFrame({"idx":["id_1"], "y":[1], "x":[2]}) + conv = DataFrameToVW(df, "y idx| col_x:x") + lines_list = conv.process_df() + first_line = lines_list[0] + assert first_line == "1 id_1| col_x:2" + +def test_multiple_lines_conversion(): + df = pd.DataFrame({"y": [1, -1], "x":[1, 2]}) + conv = DataFrameToVW(df, "y | x") + lines_list = conv.process_df() + assert lines_list == ["1 | 1", "-1 | 2"] + +def test_oneline_with_multiple_namespaces(): + df = pd.DataFrame({"y":[1], "a":[2], "b":[3]}) + conv = DataFrameToVW(df, "y |FirstNameSpace a |DoubleIt:2 b") + lines_list = conv.process_df() + first_line = lines_list[0] + assert first_line == "1 |FirstNameSpace 2 |DoubleIt:2 3" + + diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index 584d88eeaef..08826e207a2 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -4,6 +4,7 @@ from __future__ import division import pylibvw import warnings +import pandas as pd class SearchTask(): """Search task class""" @@ -1354,3 +1355,147 @@ def get_label(self, label_class=simple_label): simple_label """ return label_class(self) + + + +class DataFrameToVW: + def __init__(self, df, formula): + """ + Parameters + ---------- + df : pandas.DataFrame + The dataframe to convert + formula : str + The formula specifying the VW ouput needed + + Examples + -------- + + >>> from vowpalwabbit import DataFrameToVW + >>> from pandas as pd + >>> df = pd.DataFrame({"y": [0], "x": [1]}) + >>> conv = DataFrameToVW(df, "y | x") + >>> vw_lines = conv.process_df() + + Returns + ------- + self: DataFrameToVW + """ + self.df = df + self.n_rows = df.shape[0] + self.column_names = set(df.columns) + self.formula = formula + + def process_target_space(self, target_space): + """ + Helper function that process the target space. + + Parameters + ---------- + target_space : str + A formula representing the target space : [label] [importance] [base] [tag] + + Raises + ------ + ValueError + If the column specified in the formula does not exist in the dataframe. + + + Returns + ------- + out : pd.Series + The pd.Series of the lines of the feature space + + """ + no_tag = target_space.endswith(" ") + + splitted = target_space.split() + absent_cols = [col not in self.column_names for col in splitted] + if any(absent_cols): + raise ValueError( + "Column(s) '{}' not in data.frame 'df'".format(absent_cols) + ) + + out = pd.Series([""] * self.n_rows) + for (i, col) in enumerate(splitted): + if i == 0: + out += self.df[col].apply(str) + else: + out += " " + self.df[col].apply(str) + + if no_tag: + out += " " + + return out + + def process_feature_space(self, features_space): + """ + Helper function that process the formula for a given features space. + + Parameters + ---------- + features_space : str + The formula that contains the features. A + namespace can optionally be added. + + Raises + ------ + ValueError + If the column specified in the formula does not exist in the dataframe. + + Returns + ------- + out : pd.Series + The pd.Series of the lines of the feature space + + """ + + has_namespace = not features_space.startswith(" ") + if has_namespace: + splitted = features_space.rstrip().split() + namespace, features = splitted[0], splitted[1:] + out = pd.Series([namespace] * self.n_rows) + else: + features = features_space.strip().split() + out = pd.Series([""] * self.n_rows) + + for feature in features: + if ":" in feature: + feature_name, col_name = feature.split(":") + feature_name += ":" + else: + feature_name, col_name = "", feature + if col_name not in self.column_names: + raise ValueError( + "Column '{}' not in data.frame 'df'".format(col_name) + ) + col_str = self.df[col_name].apply(str) + out += " " + feature_name + col_str + out += " " + return out + + def process_df(self): + """ + Convert pandas.DataFrame to a suitable Vowpal Wabbit format + + Parameters + ---------- + + Returns + ------- + list + The list of the VW lines + + """ + splitted_formula = self.formula.split("|") + target_space, features_spaces = splitted_formula[0], splitted_formula[1:] + out = self.process_target_space(target_space) + features_list = [ + self.process_feature_space(features_space) + for features_space in features_spaces + ] + for f in features_list: + out += "|"+ f + return out.str.rstrip().to_list() + + From 76b62e00e68f9bfe60fa7947b225624b4acaa092 Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Fri, 1 May 2020 15:40:49 +0200 Subject: [PATCH 02/18] fix docstring --- python/vowpalwabbit/pyvw.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index 08826e207a2..dfcbed4ce9d 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -1359,6 +1359,7 @@ def get_label(self, label_class=simple_label): class DataFrameToVW: + """DataFrameToVW class""" def __init__(self, df, formula): """ Parameters @@ -1366,7 +1367,7 @@ def __init__(self, df, formula): df : pandas.DataFrame The dataframe to convert formula : str - The formula specifying the VW ouput needed + The formula specifying the desired vowpal wabbit input format Examples -------- @@ -1483,8 +1484,8 @@ def process_df(self): Returns ------- - list - The list of the VW lines + out + The list of the lines of the DataFrame in vowpal wabbit input format """ splitted_formula = self.formula.split("|") From 566a81ee35c72573344091b853f95337cf7ad2a9 Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Fri, 1 May 2020 16:03:34 +0200 Subject: [PATCH 03/18] fix typo in test_pyvw.py --- python/tests/test_pyvw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py index f118f19743e..8bf6ef0e532 100644 --- a/python/tests/test_pyvw.py +++ b/python/tests/test_pyvw.py @@ -350,7 +350,7 @@ def check_error_raises(type, argument): def test_oneline_simple_conversion(): df = pd.DataFrame({"y": [1], "x": [2]}) - conv = DataFrameToVW(df, "y | a") + conv = DataFrameToVW(df, "y | x") lines_list = conv.process_df() first_line = lines_list[0] assert first_line == "1 | 2" From 3d00f04fa9b89c562f0d8dc4a2721ab6ac6a19af Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Fri, 1 May 2020 16:22:23 +0200 Subject: [PATCH 04/18] fix docstring in pyvw.py --- python/vowpalwabbit/pyvw.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index dfcbed4ce9d..c5182f264d5 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -1365,7 +1365,7 @@ def __init__(self, df, formula): Parameters ---------- df : pandas.DataFrame - The dataframe to convert + The DataFrame to convert formula : str The formula specifying the desired vowpal wabbit input format @@ -1399,13 +1399,13 @@ def process_target_space(self, target_space): Raises ------ ValueError - If the column specified in the formula does not exist in the dataframe. + If the column specified in the formula does not exist in the dataframe Returns ------- out : pd.Series - The pd.Series of the lines of the feature space + The pd.Series of the lines of the target space """ no_tag = target_space.endswith(" ") @@ -1436,13 +1436,12 @@ def process_feature_space(self, features_space): Parameters ---------- features_space : str - The formula that contains the features. A - namespace can optionally be added. + The formula that contains the features. A namespace can optionally be added Raises ------ ValueError - If the column specified in the formula does not exist in the dataframe. + If the column specified in the formula does not exist in the dataframe Returns ------- @@ -1477,7 +1476,7 @@ def process_feature_space(self, features_space): def process_df(self): """ - Convert pandas.DataFrame to a suitable Vowpal Wabbit format + Convert pandas.DataFrame to a suitable vowpal wabbit input format Parameters ---------- From 55043af4e2d69ab14d326b0da3d24d27373c263d Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Fri, 1 May 2020 19:45:21 +0200 Subject: [PATCH 05/18] add test to DataFrameToVW to test conversion when no target is present. Fix code style --- python/tests/test_pyvw.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py index 8bf6ef0e532..b2cc245323e 100644 --- a/python/tests/test_pyvw.py +++ b/python/tests/test_pyvw.py @@ -349,30 +349,36 @@ def check_error_raises(type, argument): def test_oneline_simple_conversion(): - df = pd.DataFrame({"y": [1], "x": [2]}) + df = pd.DataFrame({"y": [1], "x": [2]}) conv = DataFrameToVW(df, "y | x") lines_list = conv.process_df() first_line = lines_list[0] assert first_line == "1 | 2" def test_oneline_with_column_renaming_and_tag(): - df = pd.DataFrame({"idx":["id_1"], "y":[1], "x":[2]}) + df = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]}) conv = DataFrameToVW(df, "y idx| col_x:x") lines_list = conv.process_df() first_line = lines_list[0] assert first_line == "1 id_1| col_x:2" def test_multiple_lines_conversion(): - df = pd.DataFrame({"y": [1, -1], "x":[1, 2]}) + df = pd.DataFrame({"y": [1, -1], "x": [1, 2]}) conv = DataFrameToVW(df, "y | x") lines_list = conv.process_df() assert lines_list == ["1 | 1", "-1 | 2"] def test_oneline_with_multiple_namespaces(): - df = pd.DataFrame({"y":[1], "a":[2], "b":[3]}) + df = pd.DataFrame({"y": [1], "a": [2], "b": [3]}) conv = DataFrameToVW(df, "y |FirstNameSpace a |DoubleIt:2 b") lines_list = conv.process_df() first_line = lines_list[0] assert first_line == "1 |FirstNameSpace 2 |DoubleIt:2 3" +def test_oneline_without_target(): + df = pd.DataFrame({"a": [2], "b": [3]}) + conv = DataFrameToVW(df, "| a b") + lines_list = conv.process_df() + first_line = lines_list[0] + assert first_line == "| 2 3" From 31aa448eb119ebf087c835372b4e41682cdce7bb Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Tue, 5 May 2020 21:33:02 +0200 Subject: [PATCH 06/18] specify col in formula using {}, enable more freedom in formatting, check for absent cols at initialization, change formulas in tests --- python/tests/test_pyvw.py | 14 +++-- python/vowpalwabbit/pyvw.py | 116 +++++++++--------------------------- 2 files changed, 38 insertions(+), 92 deletions(-) diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py index b2cc245323e..f3053ce8a30 100644 --- a/python/tests/test_pyvw.py +++ b/python/tests/test_pyvw.py @@ -350,34 +350,38 @@ def check_error_raises(type, argument): def test_oneline_simple_conversion(): df = pd.DataFrame({"y": [1], "x": [2]}) - conv = DataFrameToVW(df, "y | x") + conv = DataFrameToVW(df, "{y} | {x}") lines_list = conv.process_df() first_line = lines_list[0] assert first_line == "1 | 2" + def test_oneline_with_column_renaming_and_tag(): df = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]}) - conv = DataFrameToVW(df, "y idx| col_x:x") + conv = DataFrameToVW(df, "{y} {idx}| col_x:{x}") lines_list = conv.process_df() first_line = lines_list[0] assert first_line == "1 id_1| col_x:2" + def test_multiple_lines_conversion(): df = pd.DataFrame({"y": [1, -1], "x": [1, 2]}) - conv = DataFrameToVW(df, "y | x") + conv = DataFrameToVW(df, "{y} | {x}") lines_list = conv.process_df() assert lines_list == ["1 | 1", "-1 | 2"] + def test_oneline_with_multiple_namespaces(): df = pd.DataFrame({"y": [1], "a": [2], "b": [3]}) - conv = DataFrameToVW(df, "y |FirstNameSpace a |DoubleIt:2 b") + conv = DataFrameToVW(df, "{y} |FirstNameSpace {a} |DoubleIt:2 {b}") lines_list = conv.process_df() first_line = lines_list[0] assert first_line == "1 |FirstNameSpace 2 |DoubleIt:2 3" + def test_oneline_without_target(): df = pd.DataFrame({"a": [2], "b": [3]}) - conv = DataFrameToVW(df, "| a b") + conv = DataFrameToVW(df, "| {a} {b}") lines_list = conv.process_df() first_line = lines_list[0] assert first_line == "| 2 3" diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index c5182f264d5..d54d272ab83 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -5,6 +5,7 @@ import pylibvw import warnings import pandas as pd +import re class SearchTask(): """Search task class""" @@ -1357,9 +1358,11 @@ def get_label(self, label_class=simple_label): return label_class(self) - class DataFrameToVW: """DataFrameToVW class""" + + re_parse_col = re.compile(pattern="{([^{}]*)}") + def __init__(self, df, formula): """ Parameters @@ -1385,94 +1388,29 @@ def __init__(self, df, formula): self.df = df self.n_rows = df.shape[0] self.column_names = set(df.columns) - self.formula = formula + self.formula = re.sub("\s+", " ", formula).strip() + self.check_absent_cols() - def process_target_space(self, target_space): + def check_absent_cols(self): """ - Helper function that process the target space. - - Parameters - ---------- - target_space : str - A formula representing the target space : [label] [importance] [base] [tag] - + Helper function that check if any of the column specified in the formula is missing. + The function raises value error if any of the column is absent. + Raises ------ ValueError If the column specified in the formula does not exist in the dataframe - - Returns - ------- - out : pd.Series - The pd.Series of the lines of the target space - """ - no_tag = target_space.endswith(" ") - splitted = target_space.split() - absent_cols = [col not in self.column_names for col in splitted] + all_cols = self.re_parse_col.findall(self.formula) + absent_cols = [col for col in all_cols if col not in self.column_names] if any(absent_cols): raise ValueError( - "Column(s) '{}' not in data.frame 'df'".format(absent_cols) - ) - - out = pd.Series([""] * self.n_rows) - for (i, col) in enumerate(splitted): - if i == 0: - out += self.df[col].apply(str) - else: - out += " " + self.df[col].apply(str) - - if no_tag: - out += " " - - return out - - def process_feature_space(self, features_space): - """ - Helper function that process the formula for a given features space. - - Parameters - ---------- - features_space : str - The formula that contains the features. A namespace can optionally be added - - Raises - ------ - ValueError - If the column specified in the formula does not exist in the dataframe - - Returns - ------- - out : pd.Series - The pd.Series of the lines of the feature space - - """ - - has_namespace = not features_space.startswith(" ") - if has_namespace: - splitted = features_space.rstrip().split() - namespace, features = splitted[0], splitted[1:] - out = pd.Series([namespace] * self.n_rows) - else: - features = features_space.strip().split() - out = pd.Series([""] * self.n_rows) - - for feature in features: - if ":" in feature: - feature_name, col_name = feature.split(":") - feature_name += ":" - else: - feature_name, col_name = "", feature - if col_name not in self.column_names: - raise ValueError( - "Column '{}' not in data.frame 'df'".format(col_name) + "Column(s) {} not in the DataFrame".format( + str(absent_cols)[1:-1] ) - col_str = self.df[col_name].apply(str) - out += " " + feature_name + col_str - out += " " - return out + ) def process_df(self): """ @@ -1487,15 +1425,19 @@ def process_df(self): The list of the lines of the DataFrame in vowpal wabbit input format """ - splitted_formula = self.formula.split("|") - target_space, features_spaces = splitted_formula[0], splitted_formula[1:] - out = self.process_target_space(target_space) - features_list = [ - self.process_feature_space(features_space) - for features_space in features_spaces - ] - for f in features_list: - out += "|"+ f - return out.str.rstrip().to_list() + matches = list(self.re_parse_col.finditer(self.formula)) + out = pd.Series([""] * self.n_rows) + + current_pos = 0 + for match in matches: + col_name = match.group()[1:-1] + start_pos, end_pos = match.span() + str_part = self.formula[current_pos:start_pos] + value_part = self.df[col_name].apply(str) + out += str_part + value_part + current_pos = end_pos + out += self.formula[current_pos : len(self.formula)] + + return out.to_list() From a55705e6c5e443b163203928b74f057442682a5e Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Wed, 6 May 2020 00:09:31 +0200 Subject: [PATCH 07/18] add check formula conformity + fix docstring. Add test for absent columns --- python/tests/test_pyvw.py | 6 ++++ python/vowpalwabbit/pyvw.py | 56 +++++++++++++++++++++++++++++++------ 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py index f3053ce8a30..c743fc502ac 100644 --- a/python/tests/test_pyvw.py +++ b/python/tests/test_pyvw.py @@ -386,3 +386,9 @@ def test_oneline_without_target(): first_line = lines_list[0] assert first_line == "| 2 3" + +def test_absent_col_error(): + with pytest.raises(ValueError) as value_error: + df = pd.DataFrame({"a": [1]}) + conv = DataFrameToVW(df, "{a} | {b} {c}") + assert "Column(s) 'b', 'c' not in the DataFrame" == str(value_error.value) diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index d54d272ab83..4947715fff1 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -1363,14 +1363,38 @@ class DataFrameToVW: re_parse_col = re.compile(pattern="{([^{}]*)}") + feature_name_pattern = "(?:\w+[:*])" + feature_value_pattern = "{[^{}]+}" + const_value_pattern = "\w+" + before_words, words, after_words = ( + "\s*\|?\s*", + "(?:{[^{}]+}|[\w:*]+)", + "\s*", + ) + re_check_formula = re.compile( + "(?:\s*\|?\s*{}?(?:{}|{})\s*)*".format( + feature_name_pattern, feature_value_pattern, const_value_pattern + ) + ) + def __init__(self, df, formula): """ + Convert a pandas DataFrame to the vowpal wabbit format defined by the user in formula parameter. + Formula is a string where the feature value of a given column is specified using + the curly braces syntax (e.g: {name_of_the_column}). The part of the formula not specified + in curly braces will be considered constant and repeated on each line. See examples + for more details. + + The following column names cannot be used in the formula : + - column names that contain the character '{' or '}' + - the empty string '' + Parameters ---------- df : pandas.DataFrame The DataFrame to convert formula : str - The formula specifying the desired vowpal wabbit input format + The formula specifying the desired vowpal wabbit input format. Examples -------- @@ -1378,8 +1402,12 @@ def __init__(self, df, formula): >>> from vowpalwabbit import DataFrameToVW >>> from pandas as pd >>> df = pd.DataFrame({"y": [0], "x": [1]}) - >>> conv = DataFrameToVW(df, "y | x") + >>> conv = DataFrameToVW(df, "{y} | {x}") >>> vw_lines = conv.process_df() + + >>> df2 = pd.DataFrame({"y": [0], "x": [1], "z": [2]}) + >>> conv2 = DataFrameToVW(df, '{y} |AllFeatures {x} {z}') + >>> vw_lines2 = conv.process_df() Returns ------- @@ -1389,8 +1417,24 @@ def __init__(self, df, formula): self.n_rows = df.shape[0] self.column_names = set(df.columns) self.formula = re.sub("\s+", " ", formula).strip() + self.check_formula() self.check_absent_cols() + def check_formula(self): + """ + Check if formula is of appropriate format + """ + match = self.re_check_formula.match(self.formula) + valid_formula = match.group() == self.formula + if not valid_formula: + valid_part = self.formula[: match.end()] + invalid_part = self.formula[match.end() :] + raise ValueError( + "Error parsing formula.\nValid: '{}'\nNot valid: '{}'".format( + valid_part, invalid_part + ) + ) + def check_absent_cols(self): """ Helper function that check if any of the column specified in the formula is missing. @@ -1406,19 +1450,15 @@ def check_absent_cols(self): all_cols = self.re_parse_col.findall(self.formula) absent_cols = [col for col in all_cols if col not in self.column_names] if any(absent_cols): + absent_cols_str = str(absent_cols)[1:-1] raise ValueError( - "Column(s) {} not in the DataFrame".format( - str(absent_cols)[1:-1] - ) + "Column(s) {} not in the DataFrame".format(absent_cols_str) ) def process_df(self): """ Convert pandas.DataFrame to a suitable vowpal wabbit input format - Parameters - ---------- - Returns ------- out From 4d223559a19ca1e498a9a55c68b2b716bfe576e0 Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Wed, 6 May 2020 00:23:23 +0200 Subject: [PATCH 08/18] fix pattern to allow decimal value --- python/vowpalwabbit/pyvw.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index 4947715fff1..3510c2f9b60 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -1365,12 +1365,7 @@ class DataFrameToVW: feature_name_pattern = "(?:\w+[:*])" feature_value_pattern = "{[^{}]+}" - const_value_pattern = "\w+" - before_words, words, after_words = ( - "\s*\|?\s*", - "(?:{[^{}]+}|[\w:*]+)", - "\s*", - ) + const_value_pattern = "[\w.]+" re_check_formula = re.compile( "(?:\s*\|?\s*{}?(?:{}|{})\s*)*".format( feature_name_pattern, feature_value_pattern, const_value_pattern From 66de092954a9bf0765597585b67deeed8d333b03 Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Wed, 6 May 2020 00:59:06 +0200 Subject: [PATCH 09/18] fix typo in docstring of DataFrameToVW.__init__ --- python/vowpalwabbit/pyvw.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index 3510c2f9b60..94af4c9a8b3 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -1401,8 +1401,8 @@ def __init__(self, df, formula): >>> vw_lines = conv.process_df() >>> df2 = pd.DataFrame({"y": [0], "x": [1], "z": [2]}) - >>> conv2 = DataFrameToVW(df, '{y} |AllFeatures {x} {z}') - >>> vw_lines2 = conv.process_df() + >>> conv2 = DataFrameToVW(df2, '{y} |AllFeatures {x} {z}') + >>> vw_lines2 = conv2.process_df() Returns ------- From 13a4441f18c02087923262a6ff26be3e5ede5f15 Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Wed, 13 May 2020 18:45:24 +0200 Subject: [PATCH 10/18] create class based formula for the conversion of datafame to vw input format --- python/tests/test_pyvw.py | 85 ++++- python/vowpalwabbit/pyvw.py | 703 ++++++++++++++++++++++++++++++++---- 2 files changed, 695 insertions(+), 93 deletions(-) diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py index c743fc502ac..c6a49f70408 100644 --- a/python/tests/test_pyvw.py +++ b/python/tests/test_pyvw.py @@ -2,7 +2,7 @@ from vowpalwabbit import pyvw from vowpalwabbit.pyvw import vw -from vowpalwabbit.pyvw import DataFrameToVW +from vowpalwabbit.pyvw import DataFrameToVW, SimpleLabel, Feature, Namespace, Col import pytest import pandas as pd @@ -347,48 +347,95 @@ def check_error_raises(type, argument): with pytest.raises(type) as error: argument() - -def test_oneline_simple_conversion(): +def test_from_colnames_constructor(): df = pd.DataFrame({"y": [1], "x": [2]}) - conv = DataFrameToVW(df, "{y} | {x}") + conv = DFtoVW.from_colnames(y="y", x=["x"], df=df) lines_list = conv.process_df() first_line = lines_list[0] assert first_line == "1 | 2" -def test_oneline_with_column_renaming_and_tag(): +def test_feature_column_renaming_and_tag(): df = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]}) - conv = DataFrameToVW(df, "{y} {idx}| col_x:{x}") - lines_list = conv.process_df() - first_line = lines_list[0] + conv = DFtoVW( + label=SimpleLabel(Col("y")), + tag=SimpleLabel(Col("idx")), + namespaces=Namespace([Feature(name="col_x", value=Col("x"))]), + df=df, + ) + first_line = conv.process_df()[0] assert first_line == "1 id_1| col_x:2" +def test_feature_constant_column_with_empty_name(): + df = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]}) + conv = DFtoVW( + label=SimpleLabel(Col("y")), + tag=SimpleLabel(Col("idx")), + namespaces=Namespace([Feature(name="", value=2)]), + df=df, + ) + first_line = conv.process_df()[0] + assert first_line == "1 id_1| :2" + + +def test_feature_variable_column_name(): + df = pd.DataFrame({"y": [1], "x": [2], "a": ["col_x"]}) + conv = DFtoVW( + label=SimpleLabel(Col("y")), + namespaces=Namespace(Feature(name=Col("a"), value=Col("x"))), + df=df, + ) + first_line = conv.process_df()[0] + assert first_line == "1 | col_x:2" + + def test_multiple_lines_conversion(): df = pd.DataFrame({"y": [1, -1], "x": [1, 2]}) - conv = DataFrameToVW(df, "{y} | {x}") + conv = DFtoVW( + label=SimpleLabel(Col("y")), + namespaces=Namespace(Feature(value=Col("x"))), + df=df, + ) lines_list = conv.process_df() assert lines_list == ["1 | 1", "-1 | 2"] -def test_oneline_with_multiple_namespaces(): +def test_multiple_namespaces(): df = pd.DataFrame({"y": [1], "a": [2], "b": [3]}) - conv = DataFrameToVW(df, "{y} |FirstNameSpace {a} |DoubleIt:2 {b}") - lines_list = conv.process_df() - first_line = lines_list[0] + conv = DFtoVW( + df=df, + label=SimpleLabel(Col("y")), + namespaces=[ + Namespace(name="FirstNameSpace", features=Feature(Col("a"))), + Namespace(name="DoubleIt", value=2, features=Feature(Col("b"))), + ], + ) + first_line = conv.process_df()[0] assert first_line == "1 |FirstNameSpace 2 |DoubleIt:2 3" -def test_oneline_without_target(): +def test_without_target(): df = pd.DataFrame({"a": [2], "b": [3]}) - conv = DataFrameToVW(df, "| {a} {b}") - lines_list = conv.process_df() - first_line = lines_list[0] + conv = DFtoVW( + df=df, namespaces=Namespace([Feature(Col("a")), Feature(Col("b"))]) + ) + first_line = conv.process_df()[0] assert first_line == "| 2 3" def test_absent_col_error(): with pytest.raises(ValueError) as value_error: df = pd.DataFrame({"a": [1]}) - conv = DataFrameToVW(df, "{a} | {b} {c}") - assert "Column(s) 'b', 'c' not in the DataFrame" == str(value_error.value) + conv = DFtoVW( + df=df, + label=SimpleLabel(Col("b")), + namespaces=Namespace( + [Feature(Col("b")), Feature(Col("c")), Feature("d")] + ), + ) + expected = "The following columns do not exist in the dataframe: '{}', '{}'".format( + "b", "c" + ) + assert expected == str(value_error.value) + diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index 94af4c9a8b3..62ac1af8bb8 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -5,7 +5,8 @@ import pylibvw import warnings import pandas as pd -import re +import abc +import collections class SearchTask(): """Search task class""" @@ -1358,121 +1359,675 @@ def get_label(self, label_class=simple_label): return label_class(self) -class DataFrameToVW: - """DataFrameToVW class""" +class Col: + """Col is a convenience class to refer to a column of a dataframe. + Its methods can: + - check if the column is in a specified dataframe + - extract the column from the specified dataframe + """ + + def __init__(self, colname): + """ + Initialize a Col object + + Parameters + ---------- + + colname : str + The colname that refers to a column + + Raises + ------ + + TypeError + If attribute 'colname' is not a string + + Returns + ------- + + self : Col + + """ + if isinstance(colname, str): + self.colname = colname + else: + raise TypeError("'colname' must be a string") + + def col_exist(self, df): + """ + Check if the column 'colname' is in a dataframe 'df' + + Parameters + ---------- + + df : pandas.DataFrame + The dataframe in which to look for the column + + Returns + ------- + + bool + True if the column is in the dataframe, False otherwise. + + """ + return self.colname in df + + def get_col(self, df): + """ + Extract the column 'colname' from the dataframe 'df' + + Parameters + ---------- + + df : pandas.DataFrame + The dataframe from which to extract the column 'colname' + + Raises + ------ + + KeyError + If the column is not found in the dataframe. + + Returns + ------- + + out : pandas.Series + The column extracted from the dataframe. + + """ + try: + out = df[self.colname].fillna("").apply(str) + except KeyError: + raise KeyError( + "Column '{}' not found in dataframe".format(self.colname) + ) + else: + return out + - re_parse_col = re.compile(pattern="{([^{}]*)}") +class FormulaType(abc.ABC): + """ + The FormulaType class in an abstract class from which to subclasses the + types that will be used in the DFtoVW class. + The method 'process' is abstract and must be implemented in the subclass. + The class has two concrete implementations 'check_type' and + 'get_col_or_value'. They are helper functions that are used in subclasses + to check the type of the parameters passed when initializing objects and + to handle the values passed that can be either a literal (int/float/str) or + a Col object. + """ + + @abc.abstractmethod + def process(self, df): + """ + Abstract method that build the subclasses Feature/SimpleLabel/Namespace + string representation. If the subclasses are initialized using Col + object(s), the result will be a column, otherwise it will be a string. + + Parameters + ---------- + df : pandas.DataFrame, optional + The dataframe from which to extract column. + + """ - feature_name_pattern = "(?:\w+[:*])" - feature_value_pattern = "{[^{}]+}" - const_value_pattern = "[\w.]+" - re_check_formula = re.compile( - "(?:\s*\|?\s*{}?(?:{}|{})\s*)*".format( - feature_name_pattern, feature_value_pattern, const_value_pattern + def get_col_or_value(self, x, df): + """Returns the column 'colname' from dataframe 'df' if x is a Col + object else returns the value of x convert to string. + + Parameters + ---------- + + x : Col/str/int/float + The Col object or a literal value (str/int/float). + df : pandas.DataFrame + The dataframe in which to extract the column. + + Returns + ------- + out : str or pandas.Series + A pandas.Series if x is of type 'Col' and a string. + """ + try: + out = x.get_col(df) + except AttributeError: + out = str(x) + return out + + def get_all_cols(self): + """ + Returns attributes of an instance that are of type Col. Note that this + method search for Col just in the attributes of the instances and will + not search in the attributes of the attributes (no recursive search). + + Returns + ------- + + out : list of Col + The list of Col objects in the instance + + """ + attr_values = list(vars(self).values()) + out = [x for x in attr_values if isinstance(x, Col)] + return out + + def check_type(self, arg_name, arg_value): + """ + Check if the type of an argument is valid. Typically used in the + __init__ method of the subclasses to check arguments conformity. + Using this method requires that a dict 'expected_type' exists in the + class of the calling instance. + + Parameters + ---------- + + arg_name : str + The argument name. + arg_value : object + The argument value to check. + + Raises + ------ + + TypeError + If the argument is not of a valid type. + """ + expected_type_str = str( + [x.__name__ for x in self.expected_type[arg_name]] ) + if arg_value is not None: + if not isinstance(arg_value, self.expected_type[arg_name]): + raise TypeError( + "Parameter {} shoud be of class {}".format( + arg_name, expected_type_str[1:-1] + ) + ) + + +class SimpleLabel(FormulaType): + """The SimpleLabel class is used to build a simple label that will be plug + to build the parameters of the DFtoVW class. + """ + + expected_type = dict(name=(Col, str, int, float)) + + def __init__(self, name): + """ + Initialize a SimpleLabel instance. + + Parameters + ---------- + + name : Col/int/float/str + A Col object specifying the column to extract from a dataframe or a + constant value of type int/float/str. + + Returns + ------- + + self : SimpleLabel + + """ + super().check_type("name", name) + self.name = name + + def process(self, df): + """ + Returns the SimpleLabel string representation. + + Parameters + ---------- + + df: pandas.DataFrame + The dataframe from which to extract a column. + + Returns + ------- + + str or pandas.Series + The SimpleLabel string representation. + """ + return super().get_col_or_value(self.name, df) + + +class Feature(FormulaType): + """The Feature class is used to build a feature for the DFtoVW class""" + + expected_type = dict( + name=(Col, str, float, int), + value=(Col, str, float, int) ) - def __init__(self, df, formula): + def __init__(self, value, name=None): """ - Convert a pandas DataFrame to the vowpal wabbit format defined by the user in formula parameter. - Formula is a string where the feature value of a given column is specified using - the curly braces syntax (e.g: {name_of_the_column}). The part of the formula not specified - in curly braces will be considered constant and repeated on each line. See examples - for more details. + Initialize a Feature instance. + + Parameters + ---------- + + value : str/float/int or Col + The value of the feature. Can be a literal or a Col object. + name : str/float/int or Col, optional + The name of the feature. Can be constant value or a Col object. + + Returns + ------- + + self : Feature + + """ + super().check_type("name", name) + super().check_type("value", value) + self.name = name + self.value = value + + def process(self, df): + """ + Returns the Feature string representation. - The following column names cannot be used in the formula : - - column names that contain the character '{' or '}' - - the empty string '' - Parameters ---------- + df : pandas.DataFrame - The DataFrame to convert - formula : str - The formula specifying the desired vowpal wabbit input format. + The dataframe from which to extract a column + + Returns + ------- + + out : str or pandas.Series + The Feature string representation + + """ + value_col = super().get_col_or_value(self.value, df) + if self.name is None: + out = value_col + else: + name_col = super().get_col_or_value(self.name, df) + out = name_col + ":" + value_col + return out + + +class Namespace(FormulaType): + """The Namespace class is used to build a namespace for the DFtoVW class. + The Namespace is a container for Feature object(s). Hence, it must + be composed of a Feature object or a list of Feature objects. + """ + + expected_type = dict( + name=(str, int, float), + value=(int, float), + features=(list, Feature), + ) + + def __init__(self, features, name=None, value=None): + """ + Initialize a Namespace instance. + + Parameters + ---------- + + features : Feature or list of Feature + A (list of) Feature object(s) that will form the namespace + name : str/int/float, optional + The name of the namespace + value : int/float, optional + A constant that specify the scaling factor for the features of this + namespace. Examples -------- - >>> from vowpalwabbit import DataFrameToVW + >>> from pyvw import Namespace, Feature + >>> ns_one_feature = Namespace(Feature(Col("a"))) + >>> ns_multi_features = Namespace([Feature(Col("a")), Feature(Col("b"))]) + >>> ns_one_feature_with_name = Namespace(Feature(Col("a")), + name="FirstNamespace") + + Returns + ------- + + self: Namespace + + """ + super().check_type("name", name) + super().check_type("value", value) + super().check_type("features", features) + + self.features = ( + list(features) if isinstance(features, (list, set)) else [features] + ) + if (value is not None) and (name is None): + raise ValueError( + "Namespace cannot have a 'value' argument without a 'name' argument" + ) + self.name = name + if value is not None: + value = str(value) + self.value = value + + def process(self, df=None): + """ + Returns the Namespace string representation + """ + out = ["|"] + if self.name is not None: + out += str(self.name) + if self.value is not None: + out += [":", str(self.value)] + + return "".join(out) + + +class DFtoVW: + """ + The DFtoVW is used to convert a pandas DataFrame to a suitable VW format. + Instances of this class are build using Col object(s) and subclasses of + FormulaType such as SimpleLabel, Feature or Namespace. + The class also provided a convenience constructor to initialize the class + based on the target/features columns names only. + """ + + def __init__(self, df, namespaces, + label=None, tag=None, base=None, importance=None): + """ + Initialize a DFtoVW instance + + Parameters + ---------- + + df : pandas.DataFrame + The dataframe to convert to VW input format. + namespaces : list of Namespace/Namespace + One or more Namespace object(s), each of being composed of one or + more Feature object(s). + label : SimpleLabel + The label is the real numbers to be predicted for the examples. + importance : SimpleLabel + The importance (weight) indicating the relative importance of the + examples. + tag : SimpleLabel + The tag that is used as identifiers for the examples. + base : SimpleLabel + The base added to the prediction before computing an update. + + Examples + -------- + + >>> from vowpalwabbit.pyvw import DFtoVW >>> from pandas as pd - >>> df = pd.DataFrame({"y": [0], "x": [1]}) - >>> conv = DataFrameToVW(df, "{y} | {x}") - >>> vw_lines = conv.process_df() - - >>> df2 = pd.DataFrame({"y": [0], "x": [1], "z": [2]}) - >>> conv2 = DataFrameToVW(df2, '{y} |AllFeatures {x} {z}') - >>> vw_lines2 = conv2.process_df() + >>> df = pd.DataFrame({"y": [1], "a": [2], "b": [3]}) + >>> conv1 = DFtoVW(df=df, + label=SimpleLabel(Col("y")), + namespaces=Namespace(Feature(name="feat_a", value=Col("a")))) + >>> conv1.process_df() + + >>> conv2 = DFtoVW(df=df, + label=SimpleLabel(Col("y")), + namespaces=Namespace( + name="DoubleIt", value=2, + features=Feature(name="feat_a", value=Col("a")))) + >>> conv2.process_df() + + >>> conv3 = DFtoVW(df=df, + label=SimpleLabel(Col("y")), + namespaces=[Namespace(name="NS1", features=Feature(Col("a"))), + Namespace(name="NS2", features=Feature(Col("b")))]) + >>>conv3.process_df() Returns ------- - self: DataFrameToVW + + self : DFtoVW """ self.df = df self.n_rows = df.shape[0] - self.column_names = set(df.columns) - self.formula = re.sub("\s+", " ", formula).strip() - self.check_formula() - self.check_absent_cols() + self.targets = collections.OrderedDict( + label=label, importance=importance, base=base, tag=tag + ) + self.no_tag = tag is not None + self.namespaces = ( + list(namespaces) + if isinstance(namespaces, (list, set)) + else [namespaces] + ) + self.check_targets_type() + self.check_namespaces_type() + self.check_features_type() + self.check_if_cols_exist() + self.out = self.empty_col() + + @classmethod + def from_colnames(cls, y, x, df, cbb_label=False): + """Simple interface to building formula. + + + Parameters + ---------- + cls : DFtoVW + DFtoVW will be initialized using the arguments of this simpler + interface + y : str/list + The column(s) for the label(s) + x : str/list + The column(s) for the feature(s) + df : pandas.DataFrame + The dataframe used + cbb_label : bool, optional + Should be set to True if the label represent contextual bandit + label. + The default is False. + + Raises + ------ + + TypeError + DESCRIPTION. + + Examples + -------- + + >>> from vowpalwabbit.pyvw import DFtoVW + >>> from pandas as pd + >>> df = pd.DataFrame({"y": [1], "x": [2]}) + >>> conv = DFtoVW.from_colnames(y="y", x="x") + >>> conv.process_df() + + Returns + ------- + + DFtoVW + A initialized DFtoVW instance. - def check_formula(self): """ - Check if formula is of appropriate format + + if isinstance(y, list) and not cbb_label: + if len(y) == 1: + y = y[0] + else: + raise ValueError( + "Parameter should be a string or a list of one string" + ) + label = SimpleLabel(Col(y)) + x = list(x) if isinstance(x, (list, set)) else [x] + namespaces = Namespace( + features=[Feature(value=Col(colname)) for colname in x] + ) + return cls(namespaces=namespaces, label=label, df=df) + + def check_targets_type(self): """ - match = self.re_check_formula.match(self.formula) - valid_formula = match.group() == self.formula - if not valid_formula: - valid_part = self.formula[: match.end()] - invalid_part = self.formula[match.end() :] - raise ValueError( - "Error parsing formula.\nValid: '{}'\nNot valid: '{}'".format( - valid_part, invalid_part + Check targets arguments (label, tag, importance, base) conformity + + Raises + ------ + TypeError + If any of the targets element is not of type SimpleLabel + + """ + wrong_type_targets = [ + key + for (key, value) in self.targets.items() + if not isinstance(value, SimpleLabel) and value is not None + ] + if wrong_type_targets: + raise TypeError( + "Parameter(s) {} must be of type 'SimpleLabel'".format( + str(wrong_type_targets)[1:-1] ) ) - def check_absent_cols(self): + def check_namespaces_type(self): """ - Helper function that check if any of the column specified in the formula is missing. - The function raises value error if any of the column is absent. - + Check namespaces arguments conformity + Raises ------ - ValueError - If the column specified in the formula does not exist in the dataframe + TypeError + If parameters namespaces is not of type Namespace """ + wrong_type_namespaces = [ + not isinstance(namespace, Namespace) + for namespace in self.namespaces + ] + if any(wrong_type_namespaces): + raise TypeError( + "Parameter namespaces must be a (list of) Namespace object(s)" + ) - all_cols = self.re_parse_col.findall(self.formula) - absent_cols = [col for col in all_cols if col not in self.column_names] - if any(absent_cols): - absent_cols_str = str(absent_cols)[1:-1] - raise ValueError( - "Column(s) {} not in the DataFrame".format(absent_cols_str) + def check_features_type(self): + """ + Check if elements of namespaces are of type features + + Raises + ------ + + TypeError + If parameters any of the element in a 'Namespace' is not of type + 'Feature' + + """ + for ns in self.namespaces: + features = ns.features + wrong_type_features = [ + not isinstance(feature, Feature) for feature in features + ] + if any(wrong_type_features): + raise TypeError( + "Elements of 'Namespace' object must be of type 'Feature'" + ) + + def check_if_cols_exist(self): + """ + Check if some columns specified are not in dataframe. + + Raises + ------ + + ValueError + If some columns are not in the dataframe. + + """ + absent_cols = [] + + targets_not_none = [ + target for target in self.targets.values() if target is not None + ] + for target in targets_not_none: + absent_cols += [ + x.colname + for x in target.get_all_cols() + if not x.col_exist(self.df) + ] + + for ns in self.namespaces: + for feature in ns.features: + absent_cols += [ + x.colname + for x in feature.get_all_cols() + if not x.col_exist(self.df) + ] + + unique_absent_cols = sorted(list(set(absent_cols))) + if len(absent_cols) > 0: + msg_error = "The following columns do not exist in the dataframe: {}".format( + str(unique_absent_cols)[1:-1] ) + raise ValueError(msg_error) - def process_df(self): + def empty_col(self): """ - Convert pandas.DataFrame to a suitable vowpal wabbit input format + Create an empty string pandas column. Returns ------- - out - The list of the lines of the DataFrame in vowpal wabbit input format + + pandas.Series + A column of empty string with as much rows as the input dataframe. """ - matches = list(self.re_parse_col.finditer(self.formula)) - out = pd.Series([""] * self.n_rows) + return pd.Series([""] * self.n_rows) - current_pos = 0 - for match in matches: - col_name = match.group()[1:-1] - start_pos, end_pos = match.span() - str_part = self.formula[current_pos:start_pos] - value_part = self.df[col_name].apply(str) - out += str_part + value_part - current_pos = end_pos - out += self.formula[current_pos : len(self.formula)] + def process_targets(self): + """ + Process the targets into a unique pandas column - return out.to_list() + Returns + ------- + + out : pandas.Series + A column where each row is the processed targets + + """ + out = self.empty_col() + + for name, value in self.targets.items(): + if value is not None: + to_add = value.process(self.df) + out += to_add if (name == "label") else (" " + to_add) + elif (value is None) and (name == "tag"): + out += " " + return out + + def process_features(self, features): + """ + Process the features (of a namespace) into a unique pandas column + + Parameters + ---------- + features : list of Feature + The list of Feature objects + + Returns + ------- + out : pandas.series + The column of the processed features + + """ + out = self.empty_col() + for feature in features: + out += " " + feature.process(self.df) + return out + + def process_df(self): + """ + Main method that do the conversion of the dataframe to the VW format + + Returns + ------- + list + The list of parsed lines in VW format + """ + if not all(x is None for x in self.targets.values()): + self.out += self.process_targets() + + for (num_ns, ns_obj) in enumerate(self.namespaces): + to_add = ns_obj.process() + self.process_features(ns_obj.features) + self.out += ( + (to_add + " ") + if (num_ns < len(self.namespaces) - 1) + else to_add + ) + return self.out.to_list() From 895615178cf56feaaa9be4c27df06f735918acc8 Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Fri, 15 May 2020 17:35:46 +0200 Subject: [PATCH 11/18] remove abc class, did simple functions instead of inheriting from FormulaType --- python/vowpalwabbit/pyvw.py | 171 +++++++++++++++--------------------- 1 file changed, 72 insertions(+), 99 deletions(-) diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index 62ac1af8bb8..6412794e236 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -5,7 +5,6 @@ import pylibvw import warnings import pandas as pd -import abc import collections class SearchTask(): @@ -1445,106 +1444,79 @@ def get_col(self, df): return out -class FormulaType(abc.ABC): - """ - The FormulaType class in an abstract class from which to subclasses the - types that will be used in the DFtoVW class. - The method 'process' is abstract and must be implemented in the subclass. - The class has two concrete implementations 'check_type' and - 'get_col_or_value'. They are helper functions that are used in subclasses - to check the type of the parameters passed when initializing objects and - to handle the values passed that can be either a literal (int/float/str) or - a Col object. - """ +def get_col_or_value(x, df): + """Returns the column 'colname' from dataframe 'df' if x is a Col + object else returns the value of x convert to string. - @abc.abstractmethod - def process(self, df): - """ - Abstract method that build the subclasses Feature/SimpleLabel/Namespace - string representation. If the subclasses are initialized using Col - object(s), the result will be a column, otherwise it will be a string. + Parameters + ---------- - Parameters - ---------- - df : pandas.DataFrame, optional - The dataframe from which to extract column. + x : Col/str/int/float + The Col object or a literal value (str/int/float). + df : pandas.DataFrame + The dataframe in which to extract the column. - """ + Returns + ------- + out : str or pandas.Series + A pandas.Series if x is of type 'Col' and a string. + """ + try: + out = x.get_col(df) + except AttributeError: + out = str(x) + return out - def get_col_or_value(self, x, df): - """Returns the column 'colname' from dataframe 'df' if x is a Col - object else returns the value of x convert to string. - Parameters - ---------- - - x : Col/str/int/float - The Col object or a literal value (str/int/float). - df : pandas.DataFrame - The dataframe in which to extract the column. - - Returns - ------- - out : str or pandas.Series - A pandas.Series if x is of type 'Col' and a string. - """ - try: - out = x.get_col(df) - except AttributeError: - out = str(x) - return out +def get_all_cols(obj): + """ + Returns attributes of an instance that are of type Col. Note that this + method search for Col just in the attributes of the instances and will + not search in the attributes of the attributes (no recursive search). - def get_all_cols(self): - """ - Returns attributes of an instance that are of type Col. Note that this - method search for Col just in the attributes of the instances and will - not search in the attributes of the attributes (no recursive search). + Returns + ------- - Returns - ------- + out : list of Col + The list of Col objects in the instance - out : list of Col - The list of Col objects in the instance + """ + attr_values = list(vars(obj).values()) + out = [x for x in attr_values if isinstance(x, Col)] + return out - """ - attr_values = list(vars(self).values()) - out = [x for x in attr_values if isinstance(x, Col)] - return out - def check_type(self, arg_name, arg_value): - """ - Check if the type of an argument is valid. Typically used in the - __init__ method of the subclasses to check arguments conformity. - Using this method requires that a dict 'expected_type' exists in the - class of the calling instance. +def check_type(obj, expected_type): + """ + Check if an object is of valid type. - Parameters - ---------- + Parameters + ---------- - arg_name : str - The argument name. - arg_value : object - The argument value to check. + arg : obj + The object to check + expected_type: type or tuple of types + The types to check against - Raises - ------ + Raises + ------ - TypeError - If the argument is not of a valid type. - """ - expected_type_str = str( - [x.__name__ for x in self.expected_type[arg_name]] - ) - if arg_value is not None: - if not isinstance(arg_value, self.expected_type[arg_name]): - raise TypeError( - "Parameter {} shoud be of class {}".format( - arg_name, expected_type_str[1:-1] - ) + TypeError + If the argument is not of a valid type. + """ + expected_type_str = str( + [x.__name__ for x in expected_type] + ) + if obj is not None: + if not isinstance(obj, expected_type): + raise TypeError( + "Parameter {} shoud be of class {}".format( + obj, expected_type_str[1:-1] ) + ) -class SimpleLabel(FormulaType): +class SimpleLabel: """The SimpleLabel class is used to build a simple label that will be plug to build the parameters of the DFtoVW class. """ @@ -1568,7 +1540,7 @@ def __init__(self, name): self : SimpleLabel """ - super().check_type("name", name) + check_type(name, self.expected_type["name"]) self.name = name def process(self, df): @@ -1587,10 +1559,10 @@ def process(self, df): str or pandas.Series The SimpleLabel string representation. """ - return super().get_col_or_value(self.name, df) + return get_col_or_value(self.name, df) -class Feature(FormulaType): +class Feature: """The Feature class is used to build a feature for the DFtoVW class""" expected_type = dict( @@ -1616,8 +1588,8 @@ def __init__(self, value, name=None): self : Feature """ - super().check_type("name", name) - super().check_type("value", value) + for (arg, name_arg) in zip([name, value], ["name", "value"]) : + check_type(arg, self.expected_type[name_arg]) self.name = name self.value = value @@ -1638,16 +1610,16 @@ def process(self, df): The Feature string representation """ - value_col = super().get_col_or_value(self.value, df) + value_col = get_col_or_value(self.value, df) if self.name is None: out = value_col else: - name_col = super().get_col_or_value(self.name, df) + name_col = get_col_or_value(self.name, df) out = name_col + ":" + value_col return out -class Namespace(FormulaType): +class Namespace: """The Namespace class is used to build a namespace for the DFtoVW class. The Namespace is a container for Feature object(s). Hence, it must be composed of a Feature object or a list of Feature objects. @@ -1689,9 +1661,8 @@ def __init__(self, features, name=None, value=None): self: Namespace """ - super().check_type("name", name) - super().check_type("value", value) - super().check_type("features", features) + for (arg, name_arg) in zip([name, value, features], ["name", "value", "features"]) : + check_type(arg, self.expected_type[name_arg]) self.features = ( list(features) if isinstance(features, (list, set)) else [features] @@ -1721,8 +1692,8 @@ def process(self, df=None): class DFtoVW: """ The DFtoVW is used to convert a pandas DataFrame to a suitable VW format. - Instances of this class are build using Col object(s) and subclasses of - FormulaType such as SimpleLabel, Feature or Namespace. + Instances of this class are build using Col object(s) and classes such as + SimpleLabel, Feature or Namespace. The class also provided a convenience constructor to initialize the class based on the target/features columns names only. """ @@ -1847,6 +1818,7 @@ def from_colnames(cls, y, x, df, cbb_label=False): raise ValueError( "Parameter should be a string or a list of one string" ) + label = SimpleLabel(Col(y)) x = list(x) if isinstance(x, (list, set)) else [x] namespaces = Namespace( @@ -1936,7 +1908,7 @@ def check_if_cols_exist(self): for target in targets_not_none: absent_cols += [ x.colname - for x in target.get_all_cols() + for x in get_all_cols(target) if not x.col_exist(self.df) ] @@ -1944,7 +1916,7 @@ def check_if_cols_exist(self): for feature in ns.features: absent_cols += [ x.colname - for x in feature.get_all_cols() + for x in get_all_cols(feature) if not x.col_exist(self.df) ] @@ -2031,3 +2003,4 @@ def process_df(self): return self.out.to_list() + From f4329c3b99a1a333064a487c10b24683e3bbf00c Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Fri, 15 May 2020 17:57:13 +0200 Subject: [PATCH 12/18] fix typo on import DFtoVW class --- python/tests/test_pyvw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py index c6a49f70408..949126ae6f8 100644 --- a/python/tests/test_pyvw.py +++ b/python/tests/test_pyvw.py @@ -2,7 +2,7 @@ from vowpalwabbit import pyvw from vowpalwabbit.pyvw import vw -from vowpalwabbit.pyvw import DataFrameToVW, SimpleLabel, Feature, Namespace, Col +from vowpalwabbit.pyvw import DFtoVW, SimpleLabel, Feature, Namespace, Col import pytest import pandas as pd From d455cce1705efd22183cf810549f7ffc80a3bd2a Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Fri, 15 May 2020 18:55:10 +0200 Subject: [PATCH 13/18] handle the different init for OrderedDict in python 2.7 --- python/vowpalwabbit/pyvw.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index 6412794e236..cc280a4c029 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -1752,9 +1752,10 @@ def __init__(self, df, namespaces, """ self.df = df self.n_rows = df.shape[0] - self.targets = collections.OrderedDict( - label=label, importance=importance, base=base, tag=tag - ) + self.targets = collections.OrderedDict() + for (key, value) in zip(["label", "importance", "base", "tag"], + [label, importance, base, tag]): + self.targets[key] = value self.no_tag = tag is not None self.namespaces = ( list(namespaces) @@ -1832,6 +1833,7 @@ def check_targets_type(self): Raises ------ + TypeError If any of the targets element is not of type SimpleLabel @@ -1967,11 +1969,13 @@ def process_features(self, features): Parameters ---------- + features : list of Feature The list of Feature objects Returns ------- + out : pandas.series The column of the processed features @@ -1987,6 +1991,7 @@ def process_df(self): Returns ------- + list The list of parsed lines in VW format """ From e1f1f567a706662ac321314566796d849ed53687 Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Thu, 21 May 2020 15:39:43 +0200 Subject: [PATCH 14/18] clean docstring and fix typos, add undescore for internal function --- python/vowpalwabbit/pyvw.py | 242 ++++++++++++++---------------------- 1 file changed, 96 insertions(+), 146 deletions(-) diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index cc280a4c029..9f41c9b2c15 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -1359,33 +1359,31 @@ def get_label(self, label_class=simple_label): class Col: - """Col is a convenience class to refer to a column of a dataframe. - Its methods can: + """Refer to a column of a dataframe. + The methods of this class are used to: - check if the column is in a specified dataframe - extract the column from the specified dataframe """ def __init__(self, colname): - """ - Initialize a Col object + """Initialize a Col object. Parameters ---------- colname : str - The colname that refers to a column + The colname that refers to a column. Raises ------ TypeError - If attribute 'colname' is not a string + If attribute 'colname' is not a string. Returns ------- self : Col - """ if isinstance(colname, str): self.colname = colname @@ -1393,33 +1391,30 @@ def __init__(self, colname): raise TypeError("'colname' must be a string") def col_exist(self, df): - """ - Check if the column 'colname' is in a dataframe 'df' + """Check if the column 'colname' is in a dataframe 'df'. Parameters ---------- df : pandas.DataFrame - The dataframe in which to look for the column + The dataframe in which to look for the column. Returns ------- bool True if the column is in the dataframe, False otherwise. - """ return self.colname in df def get_col(self, df): - """ - Extract the column 'colname' from the dataframe 'df' + """Extract the column 'colname' from the dataframe 'df'. Parameters ---------- df : pandas.DataFrame - The dataframe from which to extract the column 'colname' + The dataframe from which to extract the column 'colname'. Raises ------ @@ -1432,7 +1427,6 @@ def get_col(self, df): out : pandas.Series The column extracted from the dataframe. - """ try: out = df[self.colname].fillna("").apply(str) @@ -1444,9 +1438,9 @@ def get_col(self, df): return out -def get_col_or_value(x, df): +def _get_col_or_value(x, df): """Returns the column 'colname' from dataframe 'df' if x is a Col - object else returns the value of x convert to string. + object else returns the value of x converted to string. Parameters ---------- @@ -1458,8 +1452,9 @@ def get_col_or_value(x, df): Returns ------- + out : str or pandas.Series - A pandas.Series if x is of type 'Col' and a string. + A pandas.Series if x is of type 'Col' or a string otherwise. """ try: out = x.get_col(df) @@ -1468,35 +1463,32 @@ def get_col_or_value(x, df): return out -def get_all_cols(obj): - """ - Returns attributes of an instance that are of type Col. Note that this - method search for Col just in the attributes of the instances and will - not search in the attributes of the attributes (no recursive search). +def _get_all_cols(obj): + """Returns the attributes of type Col of a given instance. Note that this + method won't search for Col types in the attributes of the attributes + (no recursive search). Returns ------- - out : list of Col - The list of Col objects in the instance - + out : list (of Col) + The list of Col objects in the instance. """ attr_values = list(vars(obj).values()) out = [x for x in attr_values if isinstance(x, Col)] return out -def check_type(obj, expected_type): - """ - Check if an object is of valid type. +def _check_type(obj, expected_type): + """Check if the type of an object is valid. Parameters ---------- - arg : obj - The object to check - expected_type: type or tuple of types - The types to check against + obj : object + The object to check. + expected_type : type or tuple of types + The type(s) to check against. Raises ------ @@ -1504,53 +1496,48 @@ def check_type(obj, expected_type): TypeError If the argument is not of a valid type. """ - expected_type_str = str( - [x.__name__ for x in expected_type] - ) + expected_type_str = str([x.__name__ for x in expected_type]) if obj is not None: if not isinstance(obj, expected_type): raise TypeError( - "Parameter {} shoud be of class {}".format( + "Parameter {} should be of type(s) {}".format( obj, expected_type_str[1:-1] ) ) class SimpleLabel: - """The SimpleLabel class is used to build a simple label that will be plug - to build the parameters of the DFtoVW class. + """The SimpleLabel class is used to build a simple label for the + constructor of DFtoVW. """ expected_type = dict(name=(Col, str, int, float)) def __init__(self, name): - """ - Initialize a SimpleLabel instance. + """Initialize a SimpleLabel instance. Parameters ---------- - name : Col/int/float/str + name : Col/str/int/float A Col object specifying the column to extract from a dataframe or a - constant value of type int/float/str. + constant value of type str/int/float. Returns ------- self : SimpleLabel - """ - check_type(name, self.expected_type["name"]) + _check_type(name, self.expected_type["name"]) self.name = name def process(self, df): - """ - Returns the SimpleLabel string representation. + """Returns the SimpleLabel string representation. Parameters ---------- - df: pandas.DataFrame + df : pandas.DataFrame The dataframe from which to extract a column. Returns @@ -1559,15 +1546,14 @@ def process(self, df): str or pandas.Series The SimpleLabel string representation. """ - return get_col_or_value(self.name, df) + return _get_col_or_value(self.name, df) class Feature: - """The Feature class is used to build a feature for the DFtoVW class""" + """A feature for the constructor of DFtoVW""" expected_type = dict( - name=(Col, str, float, int), - value=(Col, str, float, int) + name=(Col, str, float, int), value=(Col, str, float, int) ) def __init__(self, value, name=None): @@ -1586,62 +1572,56 @@ def __init__(self, value, name=None): ------- self : Feature - """ - for (arg, name_arg) in zip([name, value], ["name", "value"]) : - check_type(arg, self.expected_type[name_arg]) + for (arg, name_arg) in zip([name, value], ["name", "value"]): + _check_type(arg, self.expected_type[name_arg]) self.name = name self.value = value def process(self, df): - """ - Returns the Feature string representation. + """Returns the Feature string representation. Parameters ---------- df : pandas.DataFrame - The dataframe from which to extract a column + The dataframe from which to extract a column. Returns ------- out : str or pandas.Series - The Feature string representation - + The Feature string representation. """ - value_col = get_col_or_value(self.value, df) + value_col = _get_col_or_value(self.value, df) if self.name is None: out = value_col else: - name_col = get_col_or_value(self.name, df) + name_col = _get_col_or_value(self.name, df) out = name_col + ":" + value_col return out class Namespace: - """The Namespace class is used to build a namespace for the DFtoVW class. - The Namespace is a container for Feature object(s). Hence, it must + """A namespace for the constructor of DFtoVW. + The Namespace is a container for Feature object(s), and thus must be composed of a Feature object or a list of Feature objects. """ expected_type = dict( - name=(str, int, float), - value=(int, float), - features=(list, Feature), + name=(str, int, float), value=(int, float), features=(list, Feature), ) def __init__(self, features, name=None, value=None): - """ - Initialize a Namespace instance. + """Initialize a Namespace instance. Parameters ---------- features : Feature or list of Feature - A (list of) Feature object(s) that will form the namespace + A (list of) Feature object(s) that form the namespace. name : str/int/float, optional - The name of the namespace + The name of the namespace. value : int/float, optional A constant that specify the scaling factor for the features of this namespace. @@ -1658,18 +1638,18 @@ def __init__(self, features, name=None, value=None): Returns ------- - self: Namespace - + self : Namespace """ - for (arg, name_arg) in zip([name, value, features], ["name", "value", "features"]) : - check_type(arg, self.expected_type[name_arg]) + for (arg, name_arg) in zip([name, value, features], + ["name", "value", "features"]): + _check_type(arg, self.expected_type[name_arg]) self.features = ( list(features) if isinstance(features, (list, set)) else [features] ) if (value is not None) and (name is None): raise ValueError( - "Namespace cannot have a 'value' argument without a 'name' argument" + "Namespace can't have a 'value' argument without a 'name' argument" ) self.name = name if value is not None: @@ -1677,9 +1657,7 @@ def __init__(self, features, name=None, value=None): self.value = value def process(self, df=None): - """ - Returns the Namespace string representation - """ + """Returns the Namespace string representation""" out = ["|"] if self.name is not None: out += str(self.name) @@ -1690,18 +1668,15 @@ def process(self, df=None): class DFtoVW: - """ - The DFtoVW is used to convert a pandas DataFrame to a suitable VW format. - Instances of this class are build using Col object(s) and classes such as - SimpleLabel, Feature or Namespace. + """Convert a pandas DataFrame to a suitable VW format. + Instances of this class are built with classes such as SimpleLabel, Feature + or Namespace (that can themselves be built on Col object(s)). The class also provided a convenience constructor to initialize the class - based on the target/features columns names only. + based on the target/features column names only. """ - def __init__(self, df, namespaces, - label=None, tag=None, base=None, importance=None): - """ - Initialize a DFtoVW instance + def __init__(self, df, namespaces, label=None, tag=None): + """Initialize a DFtoVW instance. Parameters ---------- @@ -1713,13 +1688,8 @@ def __init__(self, df, namespaces, more Feature object(s). label : SimpleLabel The label is the real numbers to be predicted for the examples. - importance : SimpleLabel - The importance (weight) indicating the relative importance of the - examples. - tag : SimpleLabel + tag : str The tag that is used as identifiers for the examples. - base : SimpleLabel - The base added to the prediction before computing an update. Examples -------- @@ -1753,8 +1723,7 @@ def __init__(self, df, namespaces, self.df = df self.n_rows = df.shape[0] self.targets = collections.OrderedDict() - for (key, value) in zip(["label", "importance", "base", "tag"], - [label, importance, base, tag]): + for (key, value) in zip(["label", "tag"], [label, tag]): self.targets[key] = value self.no_tag = tag is not None self.namespaces = ( @@ -1769,25 +1738,19 @@ def __init__(self, df, namespaces, self.out = self.empty_col() @classmethod - def from_colnames(cls, y, x, df, cbb_label=False): - """Simple interface to building formula. + def from_colnames(cls, y, x, df): + """Build DFtoVW instance using column names only. Parameters ---------- - cls : DFtoVW - DFtoVW will be initialized using the arguments of this simpler - interface + y : str/list - The column(s) for the label(s) + The column for the label. x : str/list - The column(s) for the feature(s) + The column(s) for the feature(s). df : pandas.DataFrame - The dataframe used - cbb_label : bool, optional - Should be set to True if the label represent contextual bandit - label. - The default is False. + The dataframe used. Raises ------ @@ -1809,10 +1772,9 @@ def from_colnames(cls, y, x, df, cbb_label=False): DFtoVW A initialized DFtoVW instance. - """ - if isinstance(y, list) and not cbb_label: + if isinstance(y, list): if len(y) == 1: y = y[0] else: @@ -1828,15 +1790,13 @@ def from_colnames(cls, y, x, df, cbb_label=False): return cls(namespaces=namespaces, label=label, df=df) def check_targets_type(self): - """ - Check targets arguments (label, tag, importance, base) conformity + """Check targets arguments (label, tag) conformity. Raises ------ TypeError - If any of the targets element is not of type SimpleLabel - + If any of the targets element is not of type SimpleLabel. """ wrong_type_targets = [ key @@ -1851,14 +1811,13 @@ def check_targets_type(self): ) def check_namespaces_type(self): - """ - Check namespaces arguments conformity + """Check if namespaces arguments are of type Namespace. Raises ------ TypeError - If parameters namespaces is not of type Namespace + If namespaces are not of type Namespace. """ wrong_type_namespaces = [ not isinstance(namespace, Namespace) @@ -1866,20 +1825,19 @@ def check_namespaces_type(self): ] if any(wrong_type_namespaces): raise TypeError( - "Parameter namespaces must be a (list of) Namespace object(s)" + "Argument `namespaces` should be " + "a Namespace object or a list of Namespace objects" ) def check_features_type(self): - """ - Check if elements of namespaces are of type features + """Check if 'features' attribute of namespaces are of type Feature. Raises ------ TypeError If parameters any of the element in a 'Namespace' is not of type - 'Feature' - + 'Feature'. """ for ns in self.namespaces: features = ns.features @@ -1888,19 +1846,19 @@ def check_features_type(self): ] if any(wrong_type_features): raise TypeError( - "Elements of 'Namespace' object must be of type 'Feature'" + "Argument 'features' of Namespace should be " + "a Feature object or a list of Feature objects" ) def check_if_cols_exist(self): - """ - Check if some columns specified are not in dataframe. + """Check if the columns specified in the constructor are in the + dataframe. Raises ------ ValueError - If some columns are not in the dataframe. - + If one or more columns are not in the dataframe. """ absent_cols = [] @@ -1910,7 +1868,7 @@ def check_if_cols_exist(self): for target in targets_not_none: absent_cols += [ x.colname - for x in get_all_cols(target) + for x in _get_all_cols(target) if not x.col_exist(self.df) ] @@ -1918,7 +1876,7 @@ def check_if_cols_exist(self): for feature in ns.features: absent_cols += [ x.colname - for x in get_all_cols(feature) + for x in _get_all_cols(feature) if not x.col_exist(self.df) ] @@ -1930,28 +1888,24 @@ def check_if_cols_exist(self): raise ValueError(msg_error) def empty_col(self): - """ - Create an empty string pandas column. + """Create an empty string column. Returns ------- pandas.Series A column of empty string with as much rows as the input dataframe. - """ return pd.Series([""] * self.n_rows) def process_targets(self): - """ - Process the targets into a unique pandas column + """Process the targets into a unique column. Returns ------- out : pandas.Series - A column where each row is the processed targets - + A column where each row is the processed targets. """ out = self.empty_col() @@ -1964,21 +1918,19 @@ def process_targets(self): return out def process_features(self, features): - """ - Process the features (of a namespace) into a unique pandas column + """Process the features (of a namespace) into a unique column. Parameters ---------- features : list of Feature - The list of Feature objects + The list of Feature objects. Returns ------- - out : pandas.series - The column of the processed features - + out : pandas.Series + The column of the processed features. """ out = self.empty_col() for feature in features: @@ -1986,14 +1938,13 @@ def process_features(self, features): return out def process_df(self): - """ - Main method that do the conversion of the dataframe to the VW format + """Main method that converts the pandas dataframe to the VW format. Returns ------- list - The list of parsed lines in VW format + The list of parsed lines in VW format. """ if not all(x is None for x in self.targets.values()): self.out += self.process_targets() @@ -2008,4 +1959,3 @@ def process_df(self): return self.out.to_list() - From 8fff16841152a5e59b2606a99f76046b02b988c0 Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Fri, 22 May 2020 13:07:06 +0200 Subject: [PATCH 15/18] simplify tag parameter, add type checking for 'from_colnames' constructor, make not found columns method more explicit --- python/tests/test_pyvw.py | 13 +- python/vowpalwabbit/pyvw.py | 249 +++++++++++++++++++++++++++++------- 2 files changed, 208 insertions(+), 54 deletions(-) diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py index 949126ae6f8..26bf746a92b 100644 --- a/python/tests/test_pyvw.py +++ b/python/tests/test_pyvw.py @@ -359,7 +359,7 @@ def test_feature_column_renaming_and_tag(): df = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]}) conv = DFtoVW( label=SimpleLabel(Col("y")), - tag=SimpleLabel(Col("idx")), + tag=Col("idx"), namespaces=Namespace([Feature(name="col_x", value=Col("x"))]), df=df, ) @@ -371,7 +371,7 @@ def test_feature_constant_column_with_empty_name(): df = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]}) conv = DFtoVW( label=SimpleLabel(Col("y")), - tag=SimpleLabel(Col("idx")), + tag=Col("idx"), namespaces=Namespace([Feature(name="", value=2)]), df=df, ) @@ -429,13 +429,12 @@ def test_absent_col_error(): df = pd.DataFrame({"a": [1]}) conv = DFtoVW( df=df, - label=SimpleLabel(Col("b")), + label=SimpleLabel(Col("a")), namespaces=Namespace( - [Feature(Col("b")), Feature(Col("c")), Feature("d")] + [Feature(Col("a")), Feature(Col("c")), Feature("d")] ), ) - expected = "The following columns do not exist in the dataframe: '{}', '{}'".format( - "b", "c" - ) + expected = "In argument 'features', column(s) 'c' not found in dataframe" assert expected == str(value_error.value) + diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index 9f41c9b2c15..966e41f3d1e 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -1506,6 +1506,154 @@ def _check_type(obj, expected_type): ) +class Col: + """Refer to a column of a dataframe. + The methods of this class are used to: + - check if the column is in a specified dataframe + - extract the column from the specified dataframe + """ + + def __init__(self, colname): + """Initialize a Col object. + + Parameters + ---------- + + colname : str + The colname that refers to a column. + + Raises + ------ + + TypeError + If attribute 'colname' is not a string. + + Returns + ------- + + self : Col + """ + if isinstance(colname, str): + self.colname = colname + else: + raise TypeError("'colname' must be a string") + + def col_exist(self, df): + """Check if the column 'colname' is in a dataframe 'df'. + + Parameters + ---------- + + df : pandas.DataFrame + The dataframe in which to look for the column. + + Returns + ------- + + bool + True if the column is in the dataframe, False otherwise. + """ + return self.colname in df + + def get_col(self, df): + """Extract the column 'colname' from the dataframe 'df'. + + Parameters + ---------- + + df : pandas.DataFrame + The dataframe from which to extract the column 'colname'. + + Raises + ------ + + KeyError + If the column is not found in the dataframe. + + Returns + ------- + + out : pandas.Series + The column extracted from the dataframe. + """ + try: + out = df[self.colname].fillna("").apply(str) + except KeyError: + raise KeyError( + "Column '{}' not found in dataframe".format(self.colname) + ) + else: + return out + + +def _get_col_or_value(x, df): + """Returns the column 'colname' from dataframe 'df' if x is a Col + object else returns the value of x converted to string. + + Parameters + ---------- + + x : Col/str/int/float + The Col object or a literal value (str/int/float). + df : pandas.DataFrame + The dataframe in which to extract the column. + + Returns + ------- + + out : str or pandas.Series + A pandas.Series if x is of type 'Col' or a string otherwise. + """ + try: + out = x.get_col(df) + except AttributeError: + out = str(x) + return out + + +def _get_all_cols(obj): + """Returns the attributes of type Col of a given instance. Note that this + method won't search for Col types in the attributes of the attributes + (no recursive search). + + Returns + ------- + + out : list (of Col) + The list of Col objects in the instance. + """ + attr_values = list(vars(obj).values()) + out = [x for x in attr_values if isinstance(x, Col)] + return out + + +def _check_type(obj, expected_type): + """Check if the type of an object is valid. + + Parameters + ---------- + + obj : object + The object to check. + expected_type : type or tuple of types + The type(s) to check against. + + Raises + ------ + + TypeError + If the argument is not of a valid type. + """ + expected_type_str = str([x.__name__ for x in expected_type]) + if obj is not None: + if not isinstance(obj, expected_type): + raise TypeError( + "Parameter {} should be of type(s) {}".format( + obj, expected_type_str[1:-1] + ) + ) + + class SimpleLabel: """The SimpleLabel class is used to build a simple label for the constructor of DFtoVW. @@ -1688,7 +1836,7 @@ def __init__(self, df, namespaces, label=None, tag=None): more Feature object(s). label : SimpleLabel The label is the real numbers to be predicted for the examples. - tag : str + tag : Col or str The tag that is used as identifiers for the examples. Examples @@ -1722,16 +1870,14 @@ def __init__(self, df, namespaces, label=None, tag=None): """ self.df = df self.n_rows = df.shape[0] - self.targets = collections.OrderedDict() - for (key, value) in zip(["label", "tag"], [label, tag]): - self.targets[key] = value - self.no_tag = tag is not None + self.label = label + self.tag = tag self.namespaces = ( list(namespaces) if isinstance(namespaces, (list, set)) else [namespaces] ) - self.check_targets_type() + self.check_label_type() self.check_namespaces_type() self.check_features_type() self.check_if_cols_exist() @@ -1756,7 +1902,7 @@ def from_colnames(cls, y, x, df): ------ TypeError - DESCRIPTION. + If argument label is a list of multiple strings Examples -------- @@ -1779,8 +1925,14 @@ def from_colnames(cls, y, x, df): y = y[0] else: raise ValueError( - "Parameter should be a string or a list of one string" + "Parameter should a list of one string (or a string)" ) + if not isinstance(x, str): + raise TypeError("Argument 'x' should be a string") + if not isinstance(x, str): + raise TypeError( + "Argument 'y' should be a string or a list of one string" + ) label = SimpleLabel(Col(y)) x = list(x) if isinstance(x, (list, set)) else [x] @@ -1789,26 +1941,17 @@ def from_colnames(cls, y, x, df): ) return cls(namespaces=namespaces, label=label, df=df) - def check_targets_type(self): - """Check targets arguments (label, tag) conformity. + def check_label_type(self): + """Check label argument conformity. Raises ------ TypeError - If any of the targets element is not of type SimpleLabel. + If label is not of type SimpleLabel. """ - wrong_type_targets = [ - key - for (key, value) in self.targets.items() - if not isinstance(value, SimpleLabel) and value is not None - ] - if wrong_type_targets: - raise TypeError( - "Parameter(s) {} must be of type 'SimpleLabel'".format( - str(wrong_type_targets)[1:-1] - ) - ) + if not isinstance(self.label, SimpleLabel) and self.label is not None: + raise TypeError("Argument 'label' must be of type 'SimpleLabel'") def check_namespaces_type(self): """Check if namespaces arguments are of type Namespace. @@ -1860,31 +2003,46 @@ def check_if_cols_exist(self): ValueError If one or more columns are not in the dataframe. """ - absent_cols = [] + absent_cols = {} - targets_not_none = [ - target for target in self.targets.values() if target is not None - ] - for target in targets_not_none: - absent_cols += [ + if self.label is not None: + absent_cols["label"] = [ x.colname - for x in _get_all_cols(target) + for x in _get_all_cols(self.label) if not x.col_exist(self.df) ] + if self.tag is not None: + if isinstance(self.tag, Col) and not self.tag.col_exist(self.df): + absent_cols["tag"] = repr(self.tag.colname) + + missing_features_cols = [] for ns in self.namespaces: for feature in ns.features: - absent_cols += [ + missing_features_cols += [ x.colname for x in _get_all_cols(feature) if not x.col_exist(self.df) ] - - unique_absent_cols = sorted(list(set(absent_cols))) - if len(absent_cols) > 0: - msg_error = "The following columns do not exist in the dataframe: {}".format( - str(unique_absent_cols)[1:-1] + absent_cols["features"] = sorted(list(set(missing_features_cols))) + + absent_cols = { + key: value for (key, value) in absent_cols.items() if len(value) > 0 + } + msg_error = "" + for arg_name, missing_cols in absent_cols.items(): + missing_cols = ( + repr(missing_cols)[1:-1] + if isinstance(missing_cols, list) + else missing_cols + ) + if len(msg_error) > 0: + msg_error += "\n" + msg_error += "In argument '{}', column(s) {} not found in dataframe".format( + arg_name, missing_cols ) + + if absent_cols: raise ValueError(msg_error) def empty_col(self): @@ -1898,23 +2056,20 @@ def empty_col(self): """ return pd.Series([""] * self.n_rows) - def process_targets(self): - """Process the targets into a unique column. + def process_label_and_value(self): + """Process the label and value into a unique column. Returns ------- out : pandas.Series - A column where each row is the processed targets. + A column where each row is the processed label and value. """ out = self.empty_col() - - for name, value in self.targets.items(): - if value is not None: - to_add = value.process(self.df) - out += to_add if (name == "label") else (" " + to_add) - elif (value is None) and (name == "tag"): - out += " " + if self.label is not None: + out += self.label.process(self.df) + " " + if self.tag is not None: + out += _get_col_or_value(self.tag, self.df) return out def process_features(self, features): @@ -1946,8 +2101,8 @@ def process_df(self): list The list of parsed lines in VW format. """ - if not all(x is None for x in self.targets.values()): - self.out += self.process_targets() + if not all(x is None for x in [self.label, self.tag]): + self.out += self.process_label_and_value() for (num_ns, ns_obj) in enumerate(self.namespaces): to_add = ns_obj.process() + self.process_features(ns_obj.features) From ac6bd4ef7e76643934aaaeb8d11630b2200f6fe5 Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Fri, 22 May 2020 17:52:29 +0200 Subject: [PATCH 16/18] fix type checking for x in 'from_colnames' constructor, remove unused collections import --- python/vowpalwabbit/pyvw.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index 966e41f3d1e..a7f78a20dd0 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -5,7 +5,6 @@ import pylibvw import warnings import pandas as pd -import collections class SearchTask(): """Search task class""" @@ -1927,15 +1926,19 @@ def from_colnames(cls, y, x, df): raise ValueError( "Parameter should a list of one string (or a string)" ) - if not isinstance(x, str): - raise TypeError("Argument 'x' should be a string") - if not isinstance(x, str): + if not isinstance(y, str): raise TypeError( "Argument 'y' should be a string or a list of one string" ) label = SimpleLabel(Col(y)) + x = list(x) if isinstance(x, (list, set)) else [x] + if not all(isinstance(xi, str) for xi in x): + raise TypeError( + "Argument 'x' should be a string or a list of string" + ) + namespaces = Namespace( features=[Feature(value=Col(colname)) for colname in x] ) From 736a5699184f7a34f63d47a96beb8097b8abf5c7 Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Tue, 26 May 2020 16:16:43 +0200 Subject: [PATCH 17/18] change name of function process_label_and_value to process_label_and_tag --- python/vowpalwabbit/pyvw.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index a7f78a20dd0..6a736a3a352 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -2059,14 +2059,14 @@ def empty_col(self): """ return pd.Series([""] * self.n_rows) - def process_label_and_value(self): - """Process the label and value into a unique column. + def process_label_and_tag(self): + """Process the label and tag into a unique column. Returns ------- out : pandas.Series - A column where each row is the processed label and value. + A column where each row is the processed label and tag. """ out = self.empty_col() if self.label is not None: @@ -2105,7 +2105,7 @@ def process_df(self): The list of parsed lines in VW format. """ if not all(x is None for x in [self.label, self.tag]): - self.out += self.process_label_and_value() + self.out += self.process_label_and_tag() for (num_ns, ns_obj) in enumerate(self.namespaces): to_add = ns_obj.process() + self.process_features(ns_obj.features) From 883f2569250a8dc3d2a1025112ed02448a79df3c Mon Sep 17 00:00:00 2001 From: Etienne Kintzler Date: Tue, 26 May 2020 18:17:42 +0200 Subject: [PATCH 18/18] fix anomaly when calling process_df multiple times --- python/vowpalwabbit/pyvw.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index 6a736a3a352..439cd12c084 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -1880,7 +1880,6 @@ def __init__(self, df, namespaces, label=None, tag=None): self.check_namespaces_type() self.check_features_type() self.check_if_cols_exist() - self.out = self.empty_col() @classmethod def from_colnames(cls, y, x, df): @@ -2104,6 +2103,8 @@ def process_df(self): list The list of parsed lines in VW format. """ + self.out = self.empty_col() + if not all(x is None for x in [self.label, self.tag]): self.out += self.process_label_and_tag()