From 177405603ce570fe4cb4d5a0efab5efc5c16558c Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Fri, 1 May 2020 15:30:18 +0200
Subject: [PATCH 01/18] Add class that converts pandas.DataFrame to VW input
 format

---
 python/tests/test_pyvw.py   |  32 ++++++++
 python/vowpalwabbit/pyvw.py | 145 ++++++++++++++++++++++++++++++++++++
 2 files changed, 177 insertions(+)

diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py
index 4599ea827d0..f118f19743e 100644
--- a/python/tests/test_pyvw.py
+++ b/python/tests/test_pyvw.py
@@ -2,7 +2,9 @@
 
 from vowpalwabbit import pyvw
 from vowpalwabbit.pyvw import vw
+from vowpalwabbit.pyvw import DataFrameToVW
 import pytest
+import pandas as pd
 
 BIT_SIZE = 18
 
@@ -344,3 +346,33 @@ def check_error_raises(type, argument):
     """
     with pytest.raises(type) as error:
         argument()
+
+
+def test_oneline_simple_conversion():
+    df = pd.DataFrame({"y": [1], "x": [2]})	
+    conv = DataFrameToVW(df, "y | a")
+    lines_list = conv.process_df()
+    first_line = lines_list[0]
+    assert first_line == "1 | 2"
+
+def test_oneline_with_column_renaming_and_tag():
+    df = pd.DataFrame({"idx":["id_1"], "y":[1], "x":[2]})
+    conv = DataFrameToVW(df, "y idx| col_x:x")
+    lines_list = conv.process_df()
+    first_line = lines_list[0]
+    assert first_line == "1 id_1| col_x:2"
+
+def test_multiple_lines_conversion():
+    df = pd.DataFrame({"y": [1, -1], "x":[1, 2]})
+    conv = DataFrameToVW(df, "y | x")
+    lines_list = conv.process_df()
+    assert lines_list == ["1 | 1", "-1 | 2"]
+
+def test_oneline_with_multiple_namespaces():
+    df = pd.DataFrame({"y":[1], "a":[2], "b":[3]})
+    conv = DataFrameToVW(df, "y |FirstNameSpace a |DoubleIt:2 b")
+    lines_list = conv.process_df()
+    first_line = lines_list[0]
+    assert first_line == "1 |FirstNameSpace 2 |DoubleIt:2 3"
+
+
diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index 584d88eeaef..08826e207a2 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -4,6 +4,7 @@
 from __future__ import division
 import pylibvw
 import warnings
+import pandas as pd
 
 class SearchTask():
     """Search task class"""
@@ -1354,3 +1355,147 @@ def get_label(self, label_class=simple_label):
             simple_label
         """
         return label_class(self)
+
+
+
+class DataFrameToVW:
+    def __init__(self, df, formula):
+        """
+        Parameters
+        ----------
+        df : pandas.DataFrame
+            The dataframe to convert
+        formula : str
+            The formula specifying the VW ouput needed
+
+        Examples
+        --------
+
+        >>> from vowpalwabbit import DataFrameToVW
+        >>> from pandas as pd
+        >>> df = pd.DataFrame({"y": [0], "x": [1]})
+        >>> conv = DataFrameToVW(df, "y | x")
+        >>> vw_lines = conv.process_df()
+
+        Returns
+        -------
+        self: DataFrameToVW
+        """
+        self.df = df
+        self.n_rows = df.shape[0]
+        self.column_names = set(df.columns)
+        self.formula = formula
+
+    def process_target_space(self, target_space):
+        """
+        Helper function that process the target space.
+
+        Parameters
+        ----------
+        target_space : str
+            A formula representing the target space : [label] [importance] [base] [tag]
+
+        Raises
+        ------
+        ValueError
+            If the column specified in the formula does not exist in the dataframe.
+
+
+        Returns
+        -------
+        out : pd.Series
+            The pd.Series of the lines of the feature space
+
+        """
+        no_tag = target_space.endswith(" ")
+
+        splitted = target_space.split()
+        absent_cols = [col not in self.column_names for col in splitted]
+        if any(absent_cols):
+            raise ValueError(
+                "Column(s) '{}' not in data.frame 'df'".format(absent_cols)
+            )
+
+        out = pd.Series([""] * self.n_rows)
+        for (i, col) in enumerate(splitted):
+            if i == 0:
+                out += self.df[col].apply(str)
+            else:
+                out += " " + self.df[col].apply(str)
+
+        if no_tag:
+            out += " "
+
+        return out
+
+    def process_feature_space(self, features_space):
+        """
+        Helper function that process the formula for a given features space.
+
+        Parameters
+        ----------
+        features_space : str
+            The formula that contains the features. A
+        namespace can optionally be added.
+
+        Raises
+        ------
+        ValueError
+            If the column specified in the formula does not exist in the dataframe.
+
+        Returns
+        -------
+        out : pd.Series
+            The pd.Series of the lines of the feature space
+
+        """
+
+        has_namespace = not features_space.startswith(" ")
+        if has_namespace:
+            splitted = features_space.rstrip().split()
+            namespace, features = splitted[0], splitted[1:]
+            out = pd.Series([namespace] * self.n_rows)
+        else:
+            features = features_space.strip().split()
+            out = pd.Series([""] * self.n_rows)
+
+        for feature in features:
+            if ":" in feature:
+                feature_name, col_name = feature.split(":")
+                feature_name += ":"
+            else:
+                feature_name, col_name = "", feature
+            if col_name not in self.column_names:
+                raise ValueError(
+                    "Column '{}' not in data.frame 'df'".format(col_name)
+                )
+            col_str = self.df[col_name].apply(str)
+            out += " " + feature_name + col_str
+        out += " "
+        return out
+
+    def process_df(self):
+        """
+        Convert pandas.DataFrame to a suitable Vowpal Wabbit format
+
+        Parameters
+        ----------
+
+        Returns
+        -------
+        list
+            The list of the VW lines
+
+        """
+        splitted_formula = self.formula.split("|")
+        target_space, features_spaces = splitted_formula[0], splitted_formula[1:]
+        out = self.process_target_space(target_space)
+        features_list = [
+            self.process_feature_space(features_space)
+            for features_space in features_spaces
+        ]
+        for f in features_list:
+            out += "|"+ f
+        return out.str.rstrip().to_list()
+
+

From 76b62e00e68f9bfe60fa7947b225624b4acaa092 Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Fri, 1 May 2020 15:40:49 +0200
Subject: [PATCH 02/18] fix docstring

---
 python/vowpalwabbit/pyvw.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index 08826e207a2..dfcbed4ce9d 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -1359,6 +1359,7 @@ def get_label(self, label_class=simple_label):
 
 
 class DataFrameToVW:
+    """DataFrameToVW class"""
     def __init__(self, df, formula):
         """
         Parameters
@@ -1366,7 +1367,7 @@ def __init__(self, df, formula):
         df : pandas.DataFrame
             The dataframe to convert
         formula : str
-            The formula specifying the VW ouput needed
+            The formula specifying the desired vowpal wabbit input format
 
         Examples
         --------
@@ -1483,8 +1484,8 @@ def process_df(self):
 
         Returns
         -------
-        list
-            The list of the VW lines
+        out
+            The list of the lines of the DataFrame in vowpal wabbit input format
 
         """
         splitted_formula = self.formula.split("|")

From 566a81ee35c72573344091b853f95337cf7ad2a9 Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Fri, 1 May 2020 16:03:34 +0200
Subject: [PATCH 03/18] fix typo in test_pyvw.py

---
 python/tests/test_pyvw.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py
index f118f19743e..8bf6ef0e532 100644
--- a/python/tests/test_pyvw.py
+++ b/python/tests/test_pyvw.py
@@ -350,7 +350,7 @@ def check_error_raises(type, argument):
 
 def test_oneline_simple_conversion():
     df = pd.DataFrame({"y": [1], "x": [2]})	
-    conv = DataFrameToVW(df, "y | a")
+    conv = DataFrameToVW(df, "y | x")
     lines_list = conv.process_df()
     first_line = lines_list[0]
     assert first_line == "1 | 2"

From 3d00f04fa9b89c562f0d8dc4a2721ab6ac6a19af Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Fri, 1 May 2020 16:22:23 +0200
Subject: [PATCH 04/18] fix docstring in pyvw.py

---
 python/vowpalwabbit/pyvw.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index dfcbed4ce9d..c5182f264d5 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -1365,7 +1365,7 @@ def __init__(self, df, formula):
         Parameters
         ----------
         df : pandas.DataFrame
-            The dataframe to convert
+            The DataFrame to convert
         formula : str
             The formula specifying the desired vowpal wabbit input format
 
@@ -1399,13 +1399,13 @@ def process_target_space(self, target_space):
         Raises
         ------
         ValueError
-            If the column specified in the formula does not exist in the dataframe.
+            If the column specified in the formula does not exist in the dataframe
 
 
         Returns
         -------
         out : pd.Series
-            The pd.Series of the lines of the feature space
+            The pd.Series of the lines of the target space
 
         """
         no_tag = target_space.endswith(" ")
@@ -1436,13 +1436,12 @@ def process_feature_space(self, features_space):
         Parameters
         ----------
         features_space : str
-            The formula that contains the features. A
-        namespace can optionally be added.
+            The formula that contains the features. A namespace can optionally be added
 
         Raises
         ------
         ValueError
-            If the column specified in the formula does not exist in the dataframe.
+            If the column specified in the formula does not exist in the dataframe
 
         Returns
         -------
@@ -1477,7 +1476,7 @@ def process_feature_space(self, features_space):
 
     def process_df(self):
         """
-        Convert pandas.DataFrame to a suitable Vowpal Wabbit format
+        Convert pandas.DataFrame to a suitable vowpal wabbit input format
 
         Parameters
         ----------

From 55043af4e2d69ab14d326b0da3d24d27373c263d Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Fri, 1 May 2020 19:45:21 +0200
Subject: [PATCH 05/18] add test to DataFrameToVW to test conversion when no
 target is present. Fix code style

---
 python/tests/test_pyvw.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py
index 8bf6ef0e532..b2cc245323e 100644
--- a/python/tests/test_pyvw.py
+++ b/python/tests/test_pyvw.py
@@ -349,30 +349,36 @@ def check_error_raises(type, argument):
 
 
 def test_oneline_simple_conversion():
-    df = pd.DataFrame({"y": [1], "x": [2]})	
+    df = pd.DataFrame({"y": [1], "x": [2]})
     conv = DataFrameToVW(df, "y | x")
     lines_list = conv.process_df()
     first_line = lines_list[0]
     assert first_line == "1 | 2"
 
 def test_oneline_with_column_renaming_and_tag():
-    df = pd.DataFrame({"idx":["id_1"], "y":[1], "x":[2]})
+    df = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]})
     conv = DataFrameToVW(df, "y idx| col_x:x")
     lines_list = conv.process_df()
     first_line = lines_list[0]
     assert first_line == "1 id_1| col_x:2"
 
 def test_multiple_lines_conversion():
-    df = pd.DataFrame({"y": [1, -1], "x":[1, 2]})
+    df = pd.DataFrame({"y": [1, -1], "x": [1, 2]})
     conv = DataFrameToVW(df, "y | x")
     lines_list = conv.process_df()
     assert lines_list == ["1 | 1", "-1 | 2"]
 
 def test_oneline_with_multiple_namespaces():
-    df = pd.DataFrame({"y":[1], "a":[2], "b":[3]})
+    df = pd.DataFrame({"y": [1], "a": [2], "b": [3]})
     conv = DataFrameToVW(df, "y |FirstNameSpace a |DoubleIt:2 b")
     lines_list = conv.process_df()
     first_line = lines_list[0]
     assert first_line == "1 |FirstNameSpace 2 |DoubleIt:2 3"
 
+def test_oneline_without_target():
+    df = pd.DataFrame({"a": [2], "b": [3]})
+    conv = DataFrameToVW(df, "| a b")
+    lines_list = conv.process_df()
+    first_line = lines_list[0]
+    assert first_line == "| 2 3"
 

From 31aa448eb119ebf087c835372b4e41682cdce7bb Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Tue, 5 May 2020 21:33:02 +0200
Subject: [PATCH 06/18] specify col in formula using {}, enable more freedom in
 formatting, check for absent cols at initialization, change formulas in tests

---
 python/tests/test_pyvw.py   |  14 +++--
 python/vowpalwabbit/pyvw.py | 116 +++++++++---------------------------
 2 files changed, 38 insertions(+), 92 deletions(-)

diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py
index b2cc245323e..f3053ce8a30 100644
--- a/python/tests/test_pyvw.py
+++ b/python/tests/test_pyvw.py
@@ -350,34 +350,38 @@ def check_error_raises(type, argument):
 
 def test_oneline_simple_conversion():
     df = pd.DataFrame({"y": [1], "x": [2]})
-    conv = DataFrameToVW(df, "y | x")
+    conv = DataFrameToVW(df, "{y} | {x}")
     lines_list = conv.process_df()
     first_line = lines_list[0]
     assert first_line == "1 | 2"
 
+
 def test_oneline_with_column_renaming_and_tag():
     df = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]})
-    conv = DataFrameToVW(df, "y idx| col_x:x")
+    conv = DataFrameToVW(df, "{y} {idx}| col_x:{x}")
     lines_list = conv.process_df()
     first_line = lines_list[0]
     assert first_line == "1 id_1| col_x:2"
 
+
 def test_multiple_lines_conversion():
     df = pd.DataFrame({"y": [1, -1], "x": [1, 2]})
-    conv = DataFrameToVW(df, "y | x")
+    conv = DataFrameToVW(df, "{y} | {x}")
     lines_list = conv.process_df()
     assert lines_list == ["1 | 1", "-1 | 2"]
 
+
 def test_oneline_with_multiple_namespaces():
     df = pd.DataFrame({"y": [1], "a": [2], "b": [3]})
-    conv = DataFrameToVW(df, "y |FirstNameSpace a |DoubleIt:2 b")
+    conv = DataFrameToVW(df, "{y} |FirstNameSpace {a} |DoubleIt:2 {b}")
     lines_list = conv.process_df()
     first_line = lines_list[0]
     assert first_line == "1 |FirstNameSpace 2 |DoubleIt:2 3"
 
+
 def test_oneline_without_target():
     df = pd.DataFrame({"a": [2], "b": [3]})
-    conv = DataFrameToVW(df, "| a b")
+    conv = DataFrameToVW(df, "| {a} {b}")
     lines_list = conv.process_df()
     first_line = lines_list[0]
     assert first_line == "| 2 3"
diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index c5182f264d5..d54d272ab83 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -5,6 +5,7 @@
 import pylibvw
 import warnings
 import pandas as pd
+import re
 
 class SearchTask():
     """Search task class"""
@@ -1357,9 +1358,11 @@ def get_label(self, label_class=simple_label):
         return label_class(self)
 
 
-
 class DataFrameToVW:
     """DataFrameToVW class"""
+
+    re_parse_col = re.compile(pattern="{([^{}]*)}")
+
     def __init__(self, df, formula):
         """
         Parameters
@@ -1385,94 +1388,29 @@ def __init__(self, df, formula):
         self.df = df
         self.n_rows = df.shape[0]
         self.column_names = set(df.columns)
-        self.formula = formula
+        self.formula = re.sub("\s+", " ", formula).strip()
+        self.check_absent_cols()
 
-    def process_target_space(self, target_space):
+    def check_absent_cols(self):
         """
-        Helper function that process the target space.
-
-        Parameters
-        ----------
-        target_space : str
-            A formula representing the target space : [label] [importance] [base] [tag]
-
+        Helper function that check if any of the column specified in the formula is missing.
+        The function raises value error if any of the column is absent.
+        
         Raises
         ------
         ValueError
             If the column specified in the formula does not exist in the dataframe
 
-
-        Returns
-        -------
-        out : pd.Series
-            The pd.Series of the lines of the target space
-
         """
-        no_tag = target_space.endswith(" ")
 
-        splitted = target_space.split()
-        absent_cols = [col not in self.column_names for col in splitted]
+        all_cols = self.re_parse_col.findall(self.formula)
+        absent_cols = [col for col in all_cols if col not in self.column_names]
         if any(absent_cols):
             raise ValueError(
-                "Column(s) '{}' not in data.frame 'df'".format(absent_cols)
-            )
-
-        out = pd.Series([""] * self.n_rows)
-        for (i, col) in enumerate(splitted):
-            if i == 0:
-                out += self.df[col].apply(str)
-            else:
-                out += " " + self.df[col].apply(str)
-
-        if no_tag:
-            out += " "
-
-        return out
-
-    def process_feature_space(self, features_space):
-        """
-        Helper function that process the formula for a given features space.
-
-        Parameters
-        ----------
-        features_space : str
-            The formula that contains the features. A namespace can optionally be added
-
-        Raises
-        ------
-        ValueError
-            If the column specified in the formula does not exist in the dataframe
-
-        Returns
-        -------
-        out : pd.Series
-            The pd.Series of the lines of the feature space
-
-        """
-
-        has_namespace = not features_space.startswith(" ")
-        if has_namespace:
-            splitted = features_space.rstrip().split()
-            namespace, features = splitted[0], splitted[1:]
-            out = pd.Series([namespace] * self.n_rows)
-        else:
-            features = features_space.strip().split()
-            out = pd.Series([""] * self.n_rows)
-
-        for feature in features:
-            if ":" in feature:
-                feature_name, col_name = feature.split(":")
-                feature_name += ":"
-            else:
-                feature_name, col_name = "", feature
-            if col_name not in self.column_names:
-                raise ValueError(
-                    "Column '{}' not in data.frame 'df'".format(col_name)
+                "Column(s) {} not in the DataFrame".format(
+                    str(absent_cols)[1:-1]
                 )
-            col_str = self.df[col_name].apply(str)
-            out += " " + feature_name + col_str
-        out += " "
-        return out
+            )
 
     def process_df(self):
         """
@@ -1487,15 +1425,19 @@ def process_df(self):
             The list of the lines of the DataFrame in vowpal wabbit input format
 
         """
-        splitted_formula = self.formula.split("|")
-        target_space, features_spaces = splitted_formula[0], splitted_formula[1:]
-        out = self.process_target_space(target_space)
-        features_list = [
-            self.process_feature_space(features_space)
-            for features_space in features_spaces
-        ]
-        for f in features_list:
-            out += "|"+ f
-        return out.str.rstrip().to_list()
+        matches = list(self.re_parse_col.finditer(self.formula))
+        out = pd.Series([""] * self.n_rows)
+
+        current_pos = 0
+        for match in matches:
+            col_name = match.group()[1:-1]
+            start_pos, end_pos = match.span()
+            str_part = self.formula[current_pos:start_pos]
+            value_part = self.df[col_name].apply(str)
+            out += str_part + value_part
+            current_pos = end_pos
+        out += self.formula[current_pos : len(self.formula)]
+
+        return out.to_list()
 
 

From a55705e6c5e443b163203928b74f057442682a5e Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Wed, 6 May 2020 00:09:31 +0200
Subject: [PATCH 07/18] add formula conformity check + fix docstring. Add test
 for absent columns

---
 python/tests/test_pyvw.py   |  6 ++++
 python/vowpalwabbit/pyvw.py | 56 +++++++++++++++++++++++++++++++------
 2 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py
index f3053ce8a30..c743fc502ac 100644
--- a/python/tests/test_pyvw.py
+++ b/python/tests/test_pyvw.py
@@ -386,3 +386,9 @@ def test_oneline_without_target():
     first_line = lines_list[0]
     assert first_line == "| 2 3"
 
+
+def test_absent_col_error():
+    with pytest.raises(ValueError) as value_error:
+        df = pd.DataFrame({"a": [1]})
+        conv = DataFrameToVW(df, "{a} | {b} {c}")
+    assert "Column(s) 'b', 'c' not in the DataFrame" == str(value_error.value)
diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index d54d272ab83..4947715fff1 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -1363,14 +1363,38 @@ class DataFrameToVW:
 
     re_parse_col = re.compile(pattern="{([^{}]*)}")
 
+    feature_name_pattern = "(?:\w+[:*])"
+    feature_value_pattern = "{[^{}]+}"
+    const_value_pattern = "\w+"
+    before_words, words, after_words = (
+        "\s*\|?\s*",
+        "(?:{[^{}]+}|[\w:*]+)",
+        "\s*",
+    )
+    re_check_formula = re.compile(
+        "(?:\s*\|?\s*{}?(?:{}|{})\s*)*".format(
+            feature_name_pattern, feature_value_pattern, const_value_pattern
+        )
+    )
+
     def __init__(self, df, formula):
         """
+        Convert a pandas DataFrame to the vowpal wabbit format defined by the user in formula parameter.
+        Formula is a string where the feature value of a given column is specified using 
+        the curly braces syntax (e.g: {name_of_the_column}). The part of the formula not specified
+        in curly braces will be considered constant and repeated on each line. See examples 
+        for more details.
+
+        The following column names cannot be used in the formula :
+            - column names that contain the character '{' or '}' 
+            - the empty string ''
+        
         Parameters
         ----------
         df : pandas.DataFrame
             The DataFrame to convert
         formula : str
-            The formula specifying the desired vowpal wabbit input format
+            The formula specifying the desired vowpal wabbit input format. 
 
         Examples
         --------
@@ -1378,8 +1402,12 @@ def __init__(self, df, formula):
         >>> from vowpalwabbit import DataFrameToVW
         >>> from pandas as pd
         >>> df = pd.DataFrame({"y": [0], "x": [1]})
-        >>> conv = DataFrameToVW(df, "y | x")
+        >>> conv = DataFrameToVW(df, "{y} | {x}")
         >>> vw_lines = conv.process_df()
+        
+        >>> df2 = pd.DataFrame({"y": [0], "x": [1], "z": [2]})
+        >>> conv2 = DataFrameToVW(df, '{y} |AllFeatures {x} {z}')
+        >>> vw_lines2 = conv.process_df()
 
         Returns
         -------
@@ -1389,8 +1417,24 @@ def __init__(self, df, formula):
         self.n_rows = df.shape[0]
         self.column_names = set(df.columns)
         self.formula = re.sub("\s+", " ", formula).strip()
+        self.check_formula()
         self.check_absent_cols()
 
+    def check_formula(self):
+        """
+        Check if formula is of appropriate format
+        """
+        match = self.re_check_formula.match(self.formula)
+        valid_formula = match.group() == self.formula
+        if not valid_formula:
+            valid_part = self.formula[: match.end()]
+            invalid_part = self.formula[match.end() :]
+            raise ValueError(
+                "Error parsing formula.\nValid: '{}'\nNot valid: '{}'".format(
+                    valid_part, invalid_part
+                )
+            )
+
     def check_absent_cols(self):
         """
         Helper function that check if any of the column specified in the formula is missing.
@@ -1406,19 +1450,15 @@ def check_absent_cols(self):
         all_cols = self.re_parse_col.findall(self.formula)
         absent_cols = [col for col in all_cols if col not in self.column_names]
         if any(absent_cols):
+            absent_cols_str = str(absent_cols)[1:-1]
             raise ValueError(
-                "Column(s) {} not in the DataFrame".format(
-                    str(absent_cols)[1:-1]
-                )
+                "Column(s) {} not in the DataFrame".format(absent_cols_str)
             )
 
     def process_df(self):
         """
         Convert pandas.DataFrame to a suitable vowpal wabbit input format
 
-        Parameters
-        ----------
-
         Returns
         -------
         out

From 4d223559a19ca1e498a9a55c68b2b716bfe576e0 Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Wed, 6 May 2020 00:23:23 +0200
Subject: [PATCH 08/18] fix pattern to allow decimal value

---
 python/vowpalwabbit/pyvw.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index 4947715fff1..3510c2f9b60 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -1365,12 +1365,7 @@ class DataFrameToVW:
 
     feature_name_pattern = "(?:\w+[:*])"
     feature_value_pattern = "{[^{}]+}"
-    const_value_pattern = "\w+"
-    before_words, words, after_words = (
-        "\s*\|?\s*",
-        "(?:{[^{}]+}|[\w:*]+)",
-        "\s*",
-    )
+    const_value_pattern = "[\w.]+"
     re_check_formula = re.compile(
         "(?:\s*\|?\s*{}?(?:{}|{})\s*)*".format(
             feature_name_pattern, feature_value_pattern, const_value_pattern

From 66de092954a9bf0765597585b67deeed8d333b03 Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Wed, 6 May 2020 00:59:06 +0200
Subject: [PATCH 09/18] fix typo in docstring of DataFrameToVW.__init__

---
 python/vowpalwabbit/pyvw.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index 3510c2f9b60..94af4c9a8b3 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -1401,8 +1401,8 @@ def __init__(self, df, formula):
         >>> vw_lines = conv.process_df()
         
         >>> df2 = pd.DataFrame({"y": [0], "x": [1], "z": [2]})
-        >>> conv2 = DataFrameToVW(df, '{y} |AllFeatures {x} {z}')
-        >>> vw_lines2 = conv.process_df()
+        >>> conv2 = DataFrameToVW(df2, '{y} |AllFeatures {x} {z}')
+        >>> vw_lines2 = conv2.process_df()
 
         Returns
         -------

From 13a4441f18c02087923262a6ff26be3e5ede5f15 Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Wed, 13 May 2020 18:45:24 +0200
Subject: [PATCH 10/18] create class-based formula for the conversion of
 dataframe to vw input format

---
 python/tests/test_pyvw.py   |  85 ++++-
 python/vowpalwabbit/pyvw.py | 703 ++++++++++++++++++++++++++++++++----
 2 files changed, 695 insertions(+), 93 deletions(-)

diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py
index c743fc502ac..c6a49f70408 100644
--- a/python/tests/test_pyvw.py
+++ b/python/tests/test_pyvw.py
@@ -2,7 +2,7 @@
 
 from vowpalwabbit import pyvw
 from vowpalwabbit.pyvw import vw
-from vowpalwabbit.pyvw import DataFrameToVW
+from vowpalwabbit.pyvw import DataFrameToVW, SimpleLabel, Feature, Namespace, Col
 import pytest
 import pandas as pd
 
@@ -347,48 +347,95 @@ def check_error_raises(type, argument):
     with pytest.raises(type) as error:
         argument()
 
-
-def test_oneline_simple_conversion():
+def test_from_colnames_constructor():
     df = pd.DataFrame({"y": [1], "x": [2]})
-    conv = DataFrameToVW(df, "{y} | {x}")
+    conv = DFtoVW.from_colnames(y="y", x=["x"], df=df)
     lines_list = conv.process_df()
     first_line = lines_list[0]
     assert first_line == "1 | 2"
 
 
-def test_oneline_with_column_renaming_and_tag():
+def test_feature_column_renaming_and_tag():
     df = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]})
-    conv = DataFrameToVW(df, "{y} {idx}| col_x:{x}")
-    lines_list = conv.process_df()
-    first_line = lines_list[0]
+    conv = DFtoVW(
+        label=SimpleLabel(Col("y")),
+        tag=SimpleLabel(Col("idx")),
+        namespaces=Namespace([Feature(name="col_x", value=Col("x"))]),
+        df=df,
+    )
+    first_line = conv.process_df()[0]
     assert first_line == "1 id_1| col_x:2"
 
 
+def test_feature_constant_column_with_empty_name():
+    df = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]})
+    conv = DFtoVW(
+        label=SimpleLabel(Col("y")),
+        tag=SimpleLabel(Col("idx")),
+        namespaces=Namespace([Feature(name="", value=2)]),
+        df=df,
+    )
+    first_line = conv.process_df()[0]
+    assert first_line == "1 id_1| :2"
+
+
+def test_feature_variable_column_name():
+    df = pd.DataFrame({"y": [1], "x": [2], "a": ["col_x"]})
+    conv = DFtoVW(
+        label=SimpleLabel(Col("y")),
+        namespaces=Namespace(Feature(name=Col("a"), value=Col("x"))),
+        df=df,
+    )
+    first_line = conv.process_df()[0]
+    assert first_line == "1 | col_x:2"
+
+
 def test_multiple_lines_conversion():
     df = pd.DataFrame({"y": [1, -1], "x": [1, 2]})
-    conv = DataFrameToVW(df, "{y} | {x}")
+    conv = DFtoVW(
+        label=SimpleLabel(Col("y")),
+        namespaces=Namespace(Feature(value=Col("x"))),
+        df=df,
+    )
     lines_list = conv.process_df()
     assert lines_list == ["1 | 1", "-1 | 2"]
 
 
-def test_oneline_with_multiple_namespaces():
+def test_multiple_namespaces():
     df = pd.DataFrame({"y": [1], "a": [2], "b": [3]})
-    conv = DataFrameToVW(df, "{y} |FirstNameSpace {a} |DoubleIt:2 {b}")
-    lines_list = conv.process_df()
-    first_line = lines_list[0]
+    conv = DFtoVW(
+        df=df,
+        label=SimpleLabel(Col("y")),
+        namespaces=[
+            Namespace(name="FirstNameSpace", features=Feature(Col("a"))),
+            Namespace(name="DoubleIt", value=2, features=Feature(Col("b"))),
+        ],
+    )
+    first_line = conv.process_df()[0]
     assert first_line == "1 |FirstNameSpace 2 |DoubleIt:2 3"
 
 
-def test_oneline_without_target():
+def test_without_target():
     df = pd.DataFrame({"a": [2], "b": [3]})
-    conv = DataFrameToVW(df, "| {a} {b}")
-    lines_list = conv.process_df()
-    first_line = lines_list[0]
+    conv = DFtoVW(
+        df=df, namespaces=Namespace([Feature(Col("a")), Feature(Col("b"))])
+    )
+    first_line = conv.process_df()[0]
     assert first_line == "| 2 3"
 
 
 def test_absent_col_error():
     with pytest.raises(ValueError) as value_error:
         df = pd.DataFrame({"a": [1]})
-        conv = DataFrameToVW(df, "{a} | {b} {c}")
-    assert "Column(s) 'b', 'c' not in the DataFrame" == str(value_error.value)
+        conv = DFtoVW(
+            df=df,
+            label=SimpleLabel(Col("b")),
+            namespaces=Namespace(
+                [Feature(Col("b")), Feature(Col("c")), Feature("d")]
+            ),
+        )
+    expected = "The following columns do not exist in the dataframe: '{}', '{}'".format(
+        "b", "c"
+    )
+    assert expected == str(value_error.value)
+
diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index 94af4c9a8b3..62ac1af8bb8 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -5,7 +5,8 @@
 import pylibvw
 import warnings
 import pandas as pd
-import re
+import abc
+import collections
 
 class SearchTask():
     """Search task class"""
@@ -1358,121 +1359,675 @@ def get_label(self, label_class=simple_label):
         return label_class(self)
 
 
-class DataFrameToVW:
-    """DataFrameToVW class"""
+class Col:
+    """Col is a convenience class to refer to a column of a dataframe.
+    Its methods can:
+        - check if the column is in a specified dataframe
+        - extract the column from the specified dataframe
+    """
+
+    def __init__(self, colname):
+        """
+        Initialize a Col object
+
+        Parameters
+        ----------
+
+        colname : str
+            The colname that refers to a column
+
+        Raises
+        ------
+
+        TypeError
+            If attribute 'colname' is not a string
+
+        Returns
+        -------
+
+        self : Col
+
+        """
+        if isinstance(colname, str):
+            self.colname = colname
+        else:
+            raise TypeError("'colname' must be a string")
+
+    def col_exist(self, df):
+        """
+        Check if the column 'colname' is in a dataframe 'df'
+
+        Parameters
+        ----------
+
+        df : pandas.DataFrame
+            The dataframe in which to look for the column
+
+        Returns
+        -------
+
+        bool
+            True if the column is in the dataframe, False otherwise.
+
+        """
+        return self.colname in df
+
+    def get_col(self, df):
+        """
+        Extract the column 'colname' from the dataframe 'df'
+
+        Parameters
+        ----------
+
+        df : pandas.DataFrame
+            The dataframe from which to extract the column 'colname'
+
+        Raises
+        ------
+
+        KeyError
+            If the column is not found in the dataframe.
+
+        Returns
+        -------
+
+        out : pandas.Series
+            The column extracted from the dataframe.
+
+        """
+        try:
+            out = df[self.colname].fillna("").apply(str)
+        except KeyError:
+            raise KeyError(
+                "Column '{}' not found in dataframe".format(self.colname)
+            )
+        else:
+            return out
+
 
-    re_parse_col = re.compile(pattern="{([^{}]*)}")
+class FormulaType(abc.ABC):
+    """
+    The FormulaType class in an abstract class from which to subclasses the
+    types that will be used in the DFtoVW class.
+    The method 'process' is abstract and must be implemented in the subclass.
+    The class has two concrete implementations 'check_type' and
+    'get_col_or_value'. They are helper functions that are used in subclasses
+    to check the type of the parameters passed when initializing objects and
+    to handle the values passed that can be either a literal (int/float/str) or
+    a Col object.
+    """
+
+    @abc.abstractmethod
+    def process(self, df):
+        """
+        Abstract method that build the subclasses Feature/SimpleLabel/Namespace
+        string representation. If the subclasses are initialized using Col
+        object(s), the result will be a column, otherwise it will be a string.
+
+        Parameters
+        ----------
+        df : pandas.DataFrame, optional
+            The dataframe from which to extract column.
+
+        """
 
-    feature_name_pattern = "(?:\w+[:*])"
-    feature_value_pattern = "{[^{}]+}"
-    const_value_pattern = "[\w.]+"
-    re_check_formula = re.compile(
-        "(?:\s*\|?\s*{}?(?:{}|{})\s*)*".format(
-            feature_name_pattern, feature_value_pattern, const_value_pattern
+    def get_col_or_value(self, x, df):
+        """Returns the column 'colname' from dataframe 'df' if x is a Col
+        object else returns the value of x convert to string.
+
+        Parameters
+        ----------
+
+        x : Col/str/int/float
+            The Col object or a literal value (str/int/float).
+        df : pandas.DataFrame
+            The dataframe in which to extract the column.
+        
+        Returns
+        -------
+        out : str or pandas.Series
+            A pandas.Series if x is of type 'Col' and a string.
+        """
+        try:
+            out = x.get_col(df)
+        except AttributeError:
+            out = str(x)
+        return out
+
+    def get_all_cols(self):
+        """
+        Returns attributes of an instance that are of type Col. Note that this
+        method search for Col just in the attributes of the instances and will
+        not search in the attributes of the attributes (no recursive search).
+
+        Returns
+        -------
+
+        out : list of Col
+            The list of Col objects in the instance
+
+        """
+        attr_values = list(vars(self).values())
+        out = [x for x in attr_values if isinstance(x, Col)]
+        return out
+
+    def check_type(self, arg_name, arg_value):
+        """
+        Check if the type of an argument is valid. Typically used in the
+        __init__ method of the subclasses to check arguments conformity.
+        Using this method requires that a dict 'expected_type' exists in the
+        class of the calling instance.
+
+        Parameters
+        ----------
+
+        arg_name : str
+            The argument name.
+        arg_value : object
+            The argument value to check.
+
+        Raises
+        ------
+
+        TypeError
+            If the argument is not of a valid type.
+        """
+        expected_type_str = str(
+            [x.__name__ for x in self.expected_type[arg_name]]
         )
+        if arg_value is not None:
+            if not isinstance(arg_value, self.expected_type[arg_name]):
+                raise TypeError(
+                    "Parameter {} shoud be of class {}".format(
+                        arg_name, expected_type_str[1:-1]
+                    )
+                )
+
+
+class SimpleLabel(FormulaType):
+    """The SimpleLabel class is used to build a simple label that will be plug
+    to build the parameters of the DFtoVW class.
+    """
+
+    expected_type = dict(name=(Col, str, int, float))
+
+    def __init__(self, name):
+        """
+        Initialize a SimpleLabel instance.
+
+        Parameters
+        ----------
+
+        name : Col/int/float/str
+            A Col object specifying the column to extract from a dataframe or a
+            constant value of type int/float/str.
+
+        Returns
+        -------
+
+        self : SimpleLabel
+
+        """
+        super().check_type("name", name)
+        self.name = name
+
+    def process(self, df):
+        """
+        Returns the SimpleLabel string representation.
+
+        Parameters
+        ----------
+
+        df: pandas.DataFrame
+            The dataframe from which to extract a column.
+
+        Returns
+        -------
+
+        str or pandas.Series
+            The SimpleLabel string representation.
+        """
+        return super().get_col_or_value(self.name, df)
+
+
+class Feature(FormulaType):
+    """The Feature class is used to build a feature for the DFtoVW class"""
+
+    expected_type = dict(
+        name=(Col, str, float, int),
+        value=(Col, str, float, int)
     )
 
-    def __init__(self, df, formula):
+    def __init__(self, value, name=None):
         """
-        Convert a pandas DataFrame to the vowpal wabbit format defined by the user in formula parameter.
-        Formula is a string where the feature value of a given column is specified using 
-        the curly braces syntax (e.g: {name_of_the_column}). The part of the formula not specified
-        in curly braces will be considered constant and repeated on each line. See examples 
-        for more details.
+        Initialize a Feature instance.
+
+        Parameters
+        ----------
+
+        value : str/float/int or Col
+            The value of the feature. Can be a literal or a Col object.
+        name : str/float/int or Col, optional
+            The name of the feature. Can be constant value or a Col object.
+
+        Returns
+        -------
+
+        self : Feature
+
+        """
+        super().check_type("name", name)
+        super().check_type("value", value)
+        self.name = name
+        self.value = value
+
+    def process(self, df):
+        """
+        Returns the Feature string representation.
 
-        The following column names cannot be used in the formula :
-            - column names that contain the character '{' or '}' 
-            - the empty string ''
-        
         Parameters
         ----------
+
         df : pandas.DataFrame
-            The DataFrame to convert
-        formula : str
-            The formula specifying the desired vowpal wabbit input format. 
+            The dataframe from which to extract a column
+
+        Returns
+        -------
+
+        out : str or pandas.Series
+            The Feature string representation
+
+        """
+        value_col = super().get_col_or_value(self.value, df)
+        if self.name is None:
+            out = value_col
+        else:
+            name_col = super().get_col_or_value(self.name, df)
+            out = name_col + ":" + value_col
+        return out
+
+
+class Namespace(FormulaType):
+    """The Namespace class is used to build a namespace for the DFtoVW class.
+    The Namespace is a container for Feature object(s). Hence, it must
+    be composed of a Feature object or a list of Feature objects.
+    """
+
+    expected_type = dict(
+        name=(str, int, float),
+        value=(int, float),
+        features=(list, Feature),
+    )
+
+    def __init__(self, features, name=None, value=None):
+        """
+        Initialize a Namespace instance.
+
+        Parameters
+        ----------
+
+        features : Feature or list of Feature
+            A (list of) Feature object(s) that will form the namespace
+        name : str/int/float, optional
+            The name of the namespace
+        value : int/float, optional
+            A constant that specifies the scaling factor for the features of this
+            namespace.
 
         Examples
         --------
 
-        >>> from vowpalwabbit import DataFrameToVW
+        >>> from vowpalwabbit.pyvw import Namespace, Feature, Col
+        >>> ns_one_feature = Namespace(Feature(Col("a")))
+        >>> ns_multi_features = Namespace([Feature(Col("a")), Feature(Col("b"))])
+        >>> ns_one_feature_with_name = Namespace(Feature(Col("a")),
+                                                 name="FirstNamespace")
+
+        Returns
+        -------
+
+        self: Namespace
+
+        """
+        super().check_type("name", name)
+        super().check_type("value", value)
+        super().check_type("features", features)
+
+        self.features = (
+            list(features) if isinstance(features, (list, set)) else [features]
+        )
+        if (value is not None) and (name is None):
+            raise ValueError(
+                "Namespace cannot have a 'value' argument without a 'name' argument"
+            )
+        self.name = name
+        if value is not None:
+            value = str(value)
+        self.value = value
+
+    def process(self, df=None):
+        """
+        Returns the Namespace string representation
+        """
+        out = ["|"]
+        if self.name is not None:
+            out += str(self.name)
+            if self.value is not None:
+                out += [":", str(self.value)]
+
+        return "".join(out)
+
+
+class DFtoVW:
+    """
+    The DFtoVW is used to convert a pandas DataFrame to a suitable VW format.
+    Instances of this class are build using Col object(s) and subclasses of
+    FormulaType such as SimpleLabel, Feature or Namespace.
+    The class also provided a convenience constructor to initialize the class
+    based on the target/features columns names only.
+    """
+
+    def __init__(self, df, namespaces,
+                 label=None, tag=None, base=None, importance=None):
+        """
+        Initialize a DFtoVW instance
+
+        Parameters
+        ----------
+
+        df : pandas.DataFrame
+            The dataframe to convert to VW input format.
+        namespaces : list of Namespace/Namespace
+            One or more Namespace object(s), each of being composed of one or
+            more Feature object(s).
+        label : SimpleLabel
+            The label is the real numbers to be predicted for the examples.
+        importance : SimpleLabel
+            The importance (weight) indicating the relative importance of the
+            examples.
+        tag :  SimpleLabel
+            The tag that is used as identifiers for the examples.
+        base : SimpleLabel
+            The base added to the prediction before computing an update.
+
+        Examples
+        --------
+
+        >>> from vowpalwabbit.pyvw import DFtoVW
         >>> from pandas as pd
-        >>> df = pd.DataFrame({"y": [0], "x": [1]})
-        >>> conv = DataFrameToVW(df, "{y} | {x}")
-        >>> vw_lines = conv.process_df()
-        
-        >>> df2 = pd.DataFrame({"y": [0], "x": [1], "z": [2]})
-        >>> conv2 = DataFrameToVW(df2, '{y} |AllFeatures {x} {z}')
-        >>> vw_lines2 = conv2.process_df()
+        >>> df = pd.DataFrame({"y": [1], "a": [2], "b": [3]})
+        >>> conv1 = DFtoVW(df=df,
+                           label=SimpleLabel(Col("y")),
+                           namespaces=Namespace(Feature(name="feat_a", value=Col("a"))))
+        >>> conv1.process_df()
+
+        >>> conv2 = DFtoVW(df=df,
+                           label=SimpleLabel(Col("y")),
+                           namespaces=Namespace(
+                                   name="DoubleIt", value=2,
+                                   features=Feature(name="feat_a", value=Col("a"))))
+        >>> conv2.process_df()
+
+        >>> conv3 = DFtoVW(df=df,
+                           label=SimpleLabel(Col("y")),
+                           namespaces=[Namespace(name="NS1", features=Feature(Col("a"))),
+                                       Namespace(name="NS2", features=Feature(Col("b")))])
+        >>> conv3.process_df()
 
         Returns
         -------
-        self: DataFrameToVW
+
+        self : DFtoVW
         """
         self.df = df
         self.n_rows = df.shape[0]
-        self.column_names = set(df.columns)
-        self.formula = re.sub("\s+", " ", formula).strip()
-        self.check_formula()
-        self.check_absent_cols()
+        self.targets = collections.OrderedDict(
+            label=label, importance=importance, base=base, tag=tag
+        )
+        self.no_tag = tag is not None
+        self.namespaces = (
+            list(namespaces)
+            if isinstance(namespaces, (list, set))
+            else [namespaces]
+        )
+        self.check_targets_type()
+        self.check_namespaces_type()
+        self.check_features_type()
+        self.check_if_cols_exist()
+        self.out = self.empty_col()
+
+    @classmethod
+    def from_colnames(cls, y, x, df, cbb_label=False):
+        """Simple interface to building formula.
+
+
+        Parameters
+        ----------
+        cls : DFtoVW
+            DFtoVW will be initialized using the arguments of this simpler
+            interface
+        y : str/list
+            The column(s) for the label(s)
+        x : str/list
+            The column(s) for the feature(s)
+        df : pandas.DataFrame
+            The dataframe used
+        cbb_label : bool, optional
+            Should be set to True if the label represent contextual bandit
+            label.
+            The default is False.
+
+        Raises
+        ------
+
+        ValueError
+            If 'y' is a list of more than one element and 'cbb_label' is False.
+
+        Examples
+        --------
+
+        >>> from vowpalwabbit.pyvw import DFtoVW
+        >>> import pandas as pd
+        >>> df = pd.DataFrame({"y": [1], "x": [2]})
+        >>> conv = DFtoVW.from_colnames(y="y", x="x", df=df)
+        >>> conv.process_df()
+
+        Returns
+        -------
+
+        DFtoVW
+            A initialized DFtoVW instance.
 
-    def check_formula(self):
         """
-        Check if formula is of appropriate format
+
+        if isinstance(y, list) and not cbb_label:
+            if len(y) == 1:
+                y = y[0]
+            else:
+                raise ValueError(
+                    "Parameter should be a string or a list of one string"
+                )
+        label = SimpleLabel(Col(y))
+        x = list(x) if isinstance(x, (list, set)) else [x]
+        namespaces = Namespace(
+            features=[Feature(value=Col(colname)) for colname in x]
+        )
+        return cls(namespaces=namespaces, label=label, df=df)
+
+    def check_targets_type(self):
         """
-        match = self.re_check_formula.match(self.formula)
-        valid_formula = match.group() == self.formula
-        if not valid_formula:
-            valid_part = self.formula[: match.end()]
-            invalid_part = self.formula[match.end() :]
-            raise ValueError(
-                "Error parsing formula.\nValid: '{}'\nNot valid: '{}'".format(
-                    valid_part, invalid_part
+        Check targets arguments (label, tag, importance, base) conformity
+
+        Raises
+        ------
+        TypeError
+            If any of the targets element is not of type SimpleLabel
+
+        """
+        wrong_type_targets = [
+            key
+            for (key, value) in self.targets.items()
+            if not isinstance(value, SimpleLabel) and value is not None
+        ]
+        if wrong_type_targets:
+            raise TypeError(
+                "Parameter(s) {} must be of type 'SimpleLabel'".format(
+                    str(wrong_type_targets)[1:-1]
                 )
             )
 
-    def check_absent_cols(self):
+    def check_namespaces_type(self):
         """
-        Helper function that check if any of the column specified in the formula is missing.
-        The function raises value error if any of the column is absent.
-        
+        Check namespaces arguments conformity
+
         Raises
         ------
-        ValueError
-            If the column specified in the formula does not exist in the dataframe
 
+        TypeError
+            If parameters namespaces is not of type Namespace
         """
+        wrong_type_namespaces = [
+            not isinstance(namespace, Namespace)
+            for namespace in self.namespaces
+        ]
+        if any(wrong_type_namespaces):
+            raise TypeError(
+                "Parameter namespaces must be a (list of) Namespace object(s)"
+            )
 
-        all_cols = self.re_parse_col.findall(self.formula)
-        absent_cols = [col for col in all_cols if col not in self.column_names]
-        if any(absent_cols):
-            absent_cols_str = str(absent_cols)[1:-1]
-            raise ValueError(
-                "Column(s) {} not in the DataFrame".format(absent_cols_str)
+    def check_features_type(self):
+        """
+        Check if the elements of each namespace are of type Feature
+
+        Raises
+        ------
+
+        TypeError
+            If any of the elements in a 'Namespace' is not of type
+            'Feature'
+
+        """
+        for ns in self.namespaces:
+            features = ns.features
+            wrong_type_features = [
+                not isinstance(feature, Feature) for feature in features
+            ]
+            if any(wrong_type_features):
+                raise TypeError(
+                    "Elements of 'Namespace' object must be of type 'Feature'"
+                )
+
+    def check_if_cols_exist(self):
+        """
+        Check if some columns specified are not in dataframe.
+
+        Raises
+        ------
+
+        ValueError
+            If some columns are not in the dataframe.
+
+        """
+        absent_cols = []
+
+        targets_not_none = [
+            target for target in self.targets.values() if target is not None
+        ]
+        for target in targets_not_none:
+            absent_cols += [
+                x.colname
+                for x in target.get_all_cols()
+                if not x.col_exist(self.df)
+            ]
+
+        for ns in self.namespaces:
+            for feature in ns.features:
+                absent_cols += [
+                    x.colname
+                    for x in feature.get_all_cols()
+                    if not x.col_exist(self.df)
+                ]
+
+        unique_absent_cols = sorted(list(set(absent_cols)))
+        if len(absent_cols) > 0:
+            msg_error = "The following columns do not exist in the dataframe: {}".format(
+                str(unique_absent_cols)[1:-1]
             )
+            raise ValueError(msg_error)
 
-    def process_df(self):
+    def empty_col(self):
         """
-        Convert pandas.DataFrame to a suitable vowpal wabbit input format
+        Create an empty string pandas column.
 
         Returns
         -------
-        out
-            The list of the lines of the DataFrame in vowpal wabbit input format
+
+        pandas.Series
+            A column of empty strings with as many rows as the input dataframe.
 
         """
-        matches = list(self.re_parse_col.finditer(self.formula))
-        out = pd.Series([""] * self.n_rows)
+        return pd.Series([""] * self.n_rows)
 
-        current_pos = 0
-        for match in matches:
-            col_name = match.group()[1:-1]
-            start_pos, end_pos = match.span()
-            str_part = self.formula[current_pos:start_pos]
-            value_part = self.df[col_name].apply(str)
-            out += str_part + value_part
-            current_pos = end_pos
-        out += self.formula[current_pos : len(self.formula)]
+    def process_targets(self):
+        """
+        Process the targets into a unique pandas column
 
-        return out.to_list()
+        Returns
+        -------
+
+        out : pandas.Series
+            A column where each row is the processed targets
+
+        """
+        out = self.empty_col()
+
+        for name, value in self.targets.items():
+            if value is not None:
+                to_add = value.process(self.df)
+                out += to_add if (name == "label") else (" " + to_add)
+            elif (value is None) and (name == "tag"):
+                out += " "
+        return out
+
+    def process_features(self, features):
+        """
+        Process the features (of a namespace) into a unique pandas column
+
+        Parameters
+        ----------
+        features : list of Feature
+            The list of Feature objects
+
+        Returns
+        -------
+        out : pandas.series
+            The column of the processed features
+
+        """
+        out = self.empty_col()
+        for feature in features:
+            out += " " + feature.process(self.df)
+        return out
+
+    def process_df(self):
+        """
+        Main method that does the conversion of the dataframe to the VW format
+
+        Returns
+        -------
+        list
+            The list of parsed lines in VW format
+        """
+        if not all(x is None for x in self.targets.values()):
+            self.out += self.process_targets()
+
+        for (num_ns, ns_obj) in enumerate(self.namespaces):
+            to_add = ns_obj.process() + self.process_features(ns_obj.features)
+            self.out += (
+                (to_add + " ")
+                if (num_ns < len(self.namespaces) - 1)
+                else to_add
+            )
 
+        return self.out.to_list()
 

From 895615178cf56feaaa9be4c27df06f735918acc8 Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Fri, 15 May 2020 17:35:46 +0200
Subject: [PATCH 11/18] remove abc class, did simple functions instead of
 inheriting from FormulaType

---
 python/vowpalwabbit/pyvw.py | 171 +++++++++++++++---------------------
 1 file changed, 72 insertions(+), 99 deletions(-)

diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index 62ac1af8bb8..6412794e236 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -5,7 +5,6 @@
 import pylibvw
 import warnings
 import pandas as pd
-import abc
 import collections
 
 class SearchTask():
@@ -1445,106 +1444,79 @@ def get_col(self, df):
             return out
 
 
-class FormulaType(abc.ABC):
-    """
-    The FormulaType class in an abstract class from which to subclasses the
-    types that will be used in the DFtoVW class.
-    The method 'process' is abstract and must be implemented in the subclass.
-    The class has two concrete implementations 'check_type' and
-    'get_col_or_value'. They are helper functions that are used in subclasses
-    to check the type of the parameters passed when initializing objects and
-    to handle the values passed that can be either a literal (int/float/str) or
-    a Col object.
-    """
+def get_col_or_value(x, df):
+    """Returns the column 'colname' from dataframe 'df' if x is a Col
+    object, else returns the value of x converted to a string.
 
-    @abc.abstractmethod
-    def process(self, df):
-        """
-        Abstract method that build the subclasses Feature/SimpleLabel/Namespace
-        string representation. If the subclasses are initialized using Col
-        object(s), the result will be a column, otherwise it will be a string.
+    Parameters
+    ----------
 
-        Parameters
-        ----------
-        df : pandas.DataFrame, optional
-            The dataframe from which to extract column.
+    x : Col/str/int/float
+        The Col object or a literal value (str/int/float).
+    df : pandas.DataFrame
+        The dataframe in which to extract the column.
 
-        """
+    Returns
+    -------
+    out : str or pandas.Series
+        A pandas.Series if x is of type 'Col', otherwise a string.
+    """
+    try:
+        out = x.get_col(df)
+    except AttributeError:
+        out = str(x)
+    return out
 
-    def get_col_or_value(self, x, df):
-        """Returns the column 'colname' from dataframe 'df' if x is a Col
-        object else returns the value of x convert to string.
 
-        Parameters
-        ----------
-
-        x : Col/str/int/float
-            The Col object or a literal value (str/int/float).
-        df : pandas.DataFrame
-            The dataframe in which to extract the column.
-        
-        Returns
-        -------
-        out : str or pandas.Series
-            A pandas.Series if x is of type 'Col' and a string.
-        """
-        try:
-            out = x.get_col(df)
-        except AttributeError:
-            out = str(x)
-        return out
+def get_all_cols(obj):
+    """
+    Returns attributes of an instance that are of type Col. Note that this
+    function searches for Col only in the attributes of the instance and will
+    not search in the attributes of the attributes (no recursive search).
 
-    def get_all_cols(self):
-        """
-        Returns attributes of an instance that are of type Col. Note that this
-        method search for Col just in the attributes of the instances and will
-        not search in the attributes of the attributes (no recursive search).
+    Returns
+    -------
 
-        Returns
-        -------
+    out : list of Col
+        The list of Col objects in the instance
 
-        out : list of Col
-            The list of Col objects in the instance
+    """
+    attr_values = list(vars(obj).values())
+    out = [x for x in attr_values if isinstance(x, Col)]
+    return out
 
-        """
-        attr_values = list(vars(self).values())
-        out = [x for x in attr_values if isinstance(x, Col)]
-        return out
 
-    def check_type(self, arg_name, arg_value):
-        """
-        Check if the type of an argument is valid. Typically used in the
-        __init__ method of the subclasses to check arguments conformity.
-        Using this method requires that a dict 'expected_type' exists in the
-        class of the calling instance.
+def check_type(obj, expected_type):
+    """
+    Check if an object is of valid type.
 
-        Parameters
-        ----------
+    Parameters
+    ----------
 
-        arg_name : str
-            The argument name.
-        arg_value : object
-            The argument value to check.
+    obj : object
+        The object to check
+    expected_type: type or tuple of types
+        The types to check against
 
-        Raises
-        ------
+    Raises
+    ------
 
-        TypeError
-            If the argument is not of a valid type.
-        """
-        expected_type_str = str(
-            [x.__name__ for x in self.expected_type[arg_name]]
-        )
-        if arg_value is not None:
-            if not isinstance(arg_value, self.expected_type[arg_name]):
-                raise TypeError(
-                    "Parameter {} shoud be of class {}".format(
-                        arg_name, expected_type_str[1:-1]
-                    )
+    TypeError
+        If the argument is not of a valid type.
+    """
+    expected_type_str = str(
+        [x.__name__ for x in expected_type]
+    )
+    if obj is not None:
+        if not isinstance(obj, expected_type):
+            raise TypeError(
+                "Parameter {} shoud be of class {}".format(
+                    obj, expected_type_str[1:-1]
                 )
+            )
 
 
-class SimpleLabel(FormulaType):
+class SimpleLabel:
     """The SimpleLabel class is used to build a simple label that will be plug
     to build the parameters of the DFtoVW class.
     """
@@ -1568,7 +1540,7 @@ def __init__(self, name):
         self : SimpleLabel
 
         """
-        super().check_type("name", name)
+        check_type(name, self.expected_type["name"])
         self.name = name
 
     def process(self, df):
@@ -1587,10 +1559,10 @@ def process(self, df):
         str or pandas.Series
             The SimpleLabel string representation.
         """
-        return super().get_col_or_value(self.name, df)
+        return get_col_or_value(self.name, df)
 
 
-class Feature(FormulaType):
+class Feature:
     """The Feature class is used to build a feature for the DFtoVW class"""
 
     expected_type = dict(
@@ -1616,8 +1588,8 @@ def __init__(self, value, name=None):
         self : Feature
 
         """
-        super().check_type("name", name)
-        super().check_type("value", value)
+        for (arg, name_arg) in zip([name, value], ["name", "value"]) :
+            check_type(arg, self.expected_type[name_arg])
         self.name = name
         self.value = value
 
@@ -1638,16 +1610,16 @@ def process(self, df):
             The Feature string representation
 
         """
-        value_col = super().get_col_or_value(self.value, df)
+        value_col = get_col_or_value(self.value, df)
         if self.name is None:
             out = value_col
         else:
-            name_col = super().get_col_or_value(self.name, df)
+            name_col = get_col_or_value(self.name, df)
             out = name_col + ":" + value_col
         return out
 
 
-class Namespace(FormulaType):
+class Namespace:
     """The Namespace class is used to build a namespace for the DFtoVW class.
     The Namespace is a container for Feature object(s). Hence, it must
     be composed of a Feature object or a list of Feature objects.
@@ -1689,9 +1661,8 @@ def __init__(self, features, name=None, value=None):
         self: Namespace
 
         """
-        super().check_type("name", name)
-        super().check_type("value", value)
-        super().check_type("features", features)
+        for (arg, name_arg) in zip([name, value, features], ["name", "value", "features"]) :
+            check_type(arg, self.expected_type[name_arg])
 
         self.features = (
             list(features) if isinstance(features, (list, set)) else [features]
@@ -1721,8 +1692,8 @@ def process(self, df=None):
 class DFtoVW:
     """
     The DFtoVW is used to convert a pandas DataFrame to a suitable VW format.
-    Instances of this class are build using Col object(s) and subclasses of
-    FormulaType such as SimpleLabel, Feature or Namespace.
+    Instances of this class are built using Col object(s) and classes such as
+    SimpleLabel, Feature or Namespace.
     The class also provided a convenience constructor to initialize the class
     based on the target/features columns names only.
     """
@@ -1847,6 +1818,7 @@ def from_colnames(cls, y, x, df, cbb_label=False):
                 raise ValueError(
                     "Parameter should be a string or a list of one string"
                 )
+
         label = SimpleLabel(Col(y))
         x = list(x) if isinstance(x, (list, set)) else [x]
         namespaces = Namespace(
@@ -1936,7 +1908,7 @@ def check_if_cols_exist(self):
         for target in targets_not_none:
             absent_cols += [
                 x.colname
-                for x in target.get_all_cols()
+                for x in get_all_cols(target)
                 if not x.col_exist(self.df)
             ]
 
@@ -1944,7 +1916,7 @@ def check_if_cols_exist(self):
             for feature in ns.features:
                 absent_cols += [
                     x.colname
-                    for x in feature.get_all_cols()
+                    for x in get_all_cols(feature)
                     if not x.col_exist(self.df)
                 ]
 
@@ -2031,3 +2003,4 @@ def process_df(self):
 
         return self.out.to_list()
 
+

From f4329c3b99a1a333064a487c10b24683e3bbf00c Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Fri, 15 May 2020 17:57:13 +0200
Subject: [PATCH 12/18] fix typo on import DFtoVW class

---
 python/tests/test_pyvw.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py
index c6a49f70408..949126ae6f8 100644
--- a/python/tests/test_pyvw.py
+++ b/python/tests/test_pyvw.py
@@ -2,7 +2,7 @@
 
 from vowpalwabbit import pyvw
 from vowpalwabbit.pyvw import vw
-from vowpalwabbit.pyvw import DataFrameToVW, SimpleLabel, Feature, Namespace, Col
+from vowpalwabbit.pyvw import DFtoVW, SimpleLabel, Feature, Namespace, Col
 import pytest
 import pandas as pd
 

From d455cce1705efd22183cf810549f7ffc80a3bd2a Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Fri, 15 May 2020 18:55:10 +0200
Subject: [PATCH 13/18] handle the different init for OrderedDict in python 2.7

---
 python/vowpalwabbit/pyvw.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index 6412794e236..cc280a4c029 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -1752,9 +1752,10 @@ def __init__(self, df, namespaces,
         """
         self.df = df
         self.n_rows = df.shape[0]
-        self.targets = collections.OrderedDict(
-            label=label, importance=importance, base=base, tag=tag
-        )
+        self.targets = collections.OrderedDict()
+        for (key, value) in zip(["label", "importance", "base", "tag"],
+                                [label, importance, base, tag]):
+            self.targets[key] = value
         self.no_tag = tag is not None
         self.namespaces = (
             list(namespaces)
@@ -1832,6 +1833,7 @@ def check_targets_type(self):
 
         Raises
         ------
+
         TypeError
             If any of the targets element is not of type SimpleLabel
 
@@ -1967,11 +1969,13 @@ def process_features(self, features):
 
         Parameters
         ----------
+
         features : list of Feature
             The list of Feature objects
 
         Returns
         -------
+
         out : pandas.series
             The column of the processed features
 
@@ -1987,6 +1991,7 @@ def process_df(self):
 
         Returns
         -------
+
         list
             The list of parsed lines in VW format
         """

From e1f1f567a706662ac321314566796d849ed53687 Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Thu, 21 May 2020 15:39:43 +0200
Subject: [PATCH 14/18] clean docstring and fix typos, add underscore for
 internal function

---
 python/vowpalwabbit/pyvw.py | 242 ++++++++++++++----------------------
 1 file changed, 96 insertions(+), 146 deletions(-)

diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index cc280a4c029..9f41c9b2c15 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -1359,33 +1359,31 @@ def get_label(self, label_class=simple_label):
 
 
 class Col:
-    """Col is a convenience class to refer to a column of a dataframe.
-    Its methods can:
+    """Refer to a column of a dataframe.
+    The methods of this class are used to:
         - check if the column is in a specified dataframe
         - extract the column from the specified dataframe
     """
 
     def __init__(self, colname):
-        """
-        Initialize a Col object
+        """Initialize a Col object.
 
         Parameters
         ----------
 
         colname : str
-            The colname that refers to a column
+            The colname that refers to a column.
 
         Raises
         ------
 
         TypeError
-            If attribute 'colname' is not a string
+            If attribute 'colname' is not a string.
 
         Returns
         -------
 
         self : Col
-
         """
         if isinstance(colname, str):
             self.colname = colname
@@ -1393,33 +1391,30 @@ def __init__(self, colname):
             raise TypeError("'colname' must be a string")
 
     def col_exist(self, df):
-        """
-        Check if the column 'colname' is in a dataframe 'df'
+        """Check if the column 'colname' is in a dataframe 'df'.
 
         Parameters
         ----------
 
         df : pandas.DataFrame
-            The dataframe in which to look for the column
+            The dataframe in which to look for the column.
 
         Returns
         -------
 
         bool
             True if the column is in the dataframe, False otherwise.
-
         """
         return self.colname in df
 
     def get_col(self, df):
-        """
-        Extract the column 'colname' from the dataframe 'df'
+        """Extract the column 'colname' from the dataframe 'df'.
 
         Parameters
         ----------
 
         df : pandas.DataFrame
-            The dataframe from which to extract the column 'colname'
+            The dataframe from which to extract the column 'colname'.
 
         Raises
         ------
@@ -1432,7 +1427,6 @@ def get_col(self, df):
 
         out : pandas.Series
             The column extracted from the dataframe.
-
         """
         try:
             out = df[self.colname].fillna("").apply(str)
@@ -1444,9 +1438,9 @@ def get_col(self, df):
             return out
 
 
-def get_col_or_value(x, df):
+def _get_col_or_value(x, df):
     """Returns the column 'colname' from dataframe 'df' if x is a Col
-    object else returns the value of x convert to string.
+    object else returns the value of x converted to string.
 
     Parameters
     ----------
@@ -1458,8 +1452,9 @@ def get_col_or_value(x, df):
 
     Returns
     -------
+
     out : str or pandas.Series
-        A pandas.Series if x is of type 'Col' and a string.
+        A pandas.Series if x is of type 'Col' or a string otherwise.
     """
     try:
         out = x.get_col(df)
@@ -1468,35 +1463,32 @@ def get_col_or_value(x, df):
     return out
 
 
-def get_all_cols(obj):
-    """
-    Returns attributes of an instance that are of type Col. Note that this
-    method search for Col just in the attributes of the instances and will
-    not search in the attributes of the attributes (no recursive search).
+def _get_all_cols(obj):
+    """Returns the attributes of type Col of a given instance. Note that this
+    method won't search for Col types in the attributes of the attributes
+    (no recursive search).
 
     Returns
     -------
 
-    out : list of Col
-        The list of Col objects in the instance
-
+    out : list (of Col)
+        The list of Col objects in the instance.
     """
     attr_values = list(vars(obj).values())
     out = [x for x in attr_values if isinstance(x, Col)]
     return out
 
 
-def check_type(obj, expected_type):
-    """
-    Check if an object is of valid type.
+def _check_type(obj, expected_type):
+    """Check if the type of an object is valid.
 
     Parameters
     ----------
 
-    arg : obj
-        The object to check
-    expected_type: type or tuple of types
-        The types to check against
+    obj : object
+        The object to check.
+    expected_type : type or tuple of types
+        The type(s) to check against.
 
     Raises
     ------
@@ -1504,53 +1496,48 @@ def check_type(obj, expected_type):
     TypeError
         If the argument is not of a valid type.
     """
-    expected_type_str = str(
-        [x.__name__ for x in expected_type]
-    )
+    expected_type_str = str([x.__name__ for x in expected_type])
     if obj is not None:
         if not isinstance(obj, expected_type):
             raise TypeError(
-                "Parameter {} shoud be of class {}".format(
+                "Parameter {} should be of type(s) {}".format(
                     obj, expected_type_str[1:-1]
                 )
             )
 
 
 class SimpleLabel:
-    """The SimpleLabel class is used to build a simple label that will be plug
-    to build the parameters of the DFtoVW class.
+    """The SimpleLabel class is used to build a simple label for the
+    constructor of DFtoVW.
     """
 
     expected_type = dict(name=(Col, str, int, float))
 
     def __init__(self, name):
-        """
-        Initialize a SimpleLabel instance.
+        """Initialize a SimpleLabel instance.
 
         Parameters
         ----------
 
-        name : Col/int/float/str
+        name : Col/str/int/float
             A Col object specifying the column to extract from a dataframe or a
-            constant value of type int/float/str.
+            constant value of type str/int/float.
 
         Returns
         -------
 
         self : SimpleLabel
-
         """
-        check_type(name, self.expected_type["name"])
+        _check_type(name, self.expected_type["name"])
         self.name = name
 
     def process(self, df):
-        """
-        Returns the SimpleLabel string representation.
+        """Returns the SimpleLabel string representation.
 
         Parameters
         ----------
 
-        df: pandas.DataFrame
+        df : pandas.DataFrame
             The dataframe from which to extract a column.
 
         Returns
@@ -1559,15 +1546,14 @@ def process(self, df):
         str or pandas.Series
             The SimpleLabel string representation.
         """
-        return get_col_or_value(self.name, df)
+        return _get_col_or_value(self.name, df)
 
 
 class Feature:
-    """The Feature class is used to build a feature for the DFtoVW class"""
+    """A feature for the constructor of DFtoVW"""
 
     expected_type = dict(
-        name=(Col, str, float, int),
-        value=(Col, str, float, int)
+        name=(Col, str, float, int), value=(Col, str, float, int)
     )
 
     def __init__(self, value, name=None):
@@ -1586,62 +1572,56 @@ def __init__(self, value, name=None):
         -------
 
         self : Feature
-
         """
-        for (arg, name_arg) in zip([name, value], ["name", "value"]) :
-            check_type(arg, self.expected_type[name_arg])
+        for (arg, name_arg) in zip([name, value], ["name", "value"]):
+            _check_type(arg, self.expected_type[name_arg])
         self.name = name
         self.value = value
 
     def process(self, df):
-        """
-        Returns the Feature string representation.
+        """Returns the Feature string representation.
 
         Parameters
         ----------
 
         df : pandas.DataFrame
-            The dataframe from which to extract a column
+            The dataframe from which to extract a column.
 
         Returns
         -------
 
         out : str or pandas.Series
-            The Feature string representation
-
+            The Feature string representation.
         """
-        value_col = get_col_or_value(self.value, df)
+        value_col = _get_col_or_value(self.value, df)
         if self.name is None:
             out = value_col
         else:
-            name_col = get_col_or_value(self.name, df)
+            name_col = _get_col_or_value(self.name, df)
             out = name_col + ":" + value_col
         return out
 
 
 class Namespace:
-    """The Namespace class is used to build a namespace for the DFtoVW class.
-    The Namespace is a container for Feature object(s). Hence, it must
+    """A namespace for the constructor of DFtoVW.
+    The Namespace is a container for Feature object(s), and thus must
     be composed of a Feature object or a list of Feature objects.
     """
 
     expected_type = dict(
-        name=(str, int, float),
-        value=(int, float),
-        features=(list, Feature),
+        name=(str, int, float), value=(int, float), features=(list, Feature),
     )
 
     def __init__(self, features, name=None, value=None):
-        """
-        Initialize a Namespace instance.
+        """Initialize a Namespace instance.
 
         Parameters
         ----------
 
         features : Feature or list of Feature
-            A (list of) Feature object(s) that will form the namespace
+            A (list of) Feature object(s) that form the namespace.
         name : str/int/float, optional
-            The name of the namespace
+            The name of the namespace.
         value : int/float, optional
             A constant that specify the scaling factor for the features of this
             namespace.
@@ -1658,18 +1638,18 @@ def __init__(self, features, name=None, value=None):
         Returns
         -------
 
-        self: Namespace
-
+        self : Namespace
         """
-        for (arg, name_arg) in zip([name, value, features], ["name", "value", "features"]) :
-            check_type(arg, self.expected_type[name_arg])
+        for (arg, name_arg) in zip([name, value, features],
+                                   ["name", "value", "features"]):
+            _check_type(arg, self.expected_type[name_arg])
 
         self.features = (
             list(features) if isinstance(features, (list, set)) else [features]
         )
         if (value is not None) and (name is None):
             raise ValueError(
-                "Namespace cannot have a 'value' argument without a 'name' argument"
+                "Namespace can't have a 'value' argument without a 'name' argument"
             )
         self.name = name
         if value is not None:
@@ -1677,9 +1657,7 @@ def __init__(self, features, name=None, value=None):
         self.value = value
 
     def process(self, df=None):
-        """
-        Returns the Namespace string representation
-        """
+        """Returns the Namespace string representation"""
         out = ["|"]
         if self.name is not None:
             out += str(self.name)
@@ -1690,18 +1668,15 @@ def process(self, df=None):
 
 
 class DFtoVW:
-    """
-    The DFtoVW is used to convert a pandas DataFrame to a suitable VW format.
-    Instances of this class are build using Col object(s) and classes such as
-    SimpleLabel, Feature or Namespace.
+    """Convert a pandas DataFrame to a suitable VW format.
+    Instances of this class are built with classes such as SimpleLabel, Feature
+    or Namespace (that can themselves be built on Col object(s)).
     The class also provided a convenience constructor to initialize the class
-    based on the target/features columns names only.
+    based on the target/features column names only.
     """
 
-    def __init__(self, df, namespaces,
-                 label=None, tag=None, base=None, importance=None):
-        """
-        Initialize a DFtoVW instance
+    def __init__(self, df, namespaces, label=None, tag=None):
+        """Initialize a DFtoVW instance.
 
         Parameters
         ----------
@@ -1713,13 +1688,8 @@ def __init__(self, df, namespaces,
             more Feature object(s).
         label : SimpleLabel
             The label is the real numbers to be predicted for the examples.
-        importance : SimpleLabel
-            The importance (weight) indicating the relative importance of the
-            examples.
-        tag :  SimpleLabel
+        tag :  str
             The tag that is used as identifiers for the examples.
-        base : SimpleLabel
-            The base added to the prediction before computing an update.
 
         Examples
         --------
@@ -1753,8 +1723,7 @@ def __init__(self, df, namespaces,
         self.df = df
         self.n_rows = df.shape[0]
         self.targets = collections.OrderedDict()
-        for (key, value) in zip(["label", "importance", "base", "tag"],
-                                [label, importance, base, tag]):
+        for (key, value) in zip(["label", "tag"], [label, tag]):
             self.targets[key] = value
         self.no_tag = tag is not None
         self.namespaces = (
@@ -1769,25 +1738,19 @@ def __init__(self, df, namespaces,
         self.out = self.empty_col()
 
     @classmethod
-    def from_colnames(cls, y, x, df, cbb_label=False):
-        """Simple interface to building formula.
+    def from_colnames(cls, y, x, df):
+        """Build DFtoVW instance using column names only.
 
 
         Parameters
         ----------
-        cls : DFtoVW
-            DFtoVW will be initialized using the arguments of this simpler
-            interface
+
         y : str/list
-            The column(s) for the label(s)
+            The column for the label.
         x : str/list
-            The column(s) for the feature(s)
+            The column(s) for the feature(s).
         df : pandas.DataFrame
-            The dataframe used
-        cbb_label : bool, optional
-            Should be set to True if the label represent contextual bandit
-            label.
-            The default is False.
+            The dataframe used.
 
         Raises
         ------
@@ -1809,10 +1772,9 @@ def from_colnames(cls, y, x, df, cbb_label=False):
 
         DFtoVW
             A initialized DFtoVW instance.
-
         """
 
-        if isinstance(y, list) and not cbb_label:
+        if isinstance(y, list):
             if len(y) == 1:
                 y = y[0]
             else:
@@ -1828,15 +1790,13 @@ def from_colnames(cls, y, x, df, cbb_label=False):
         return cls(namespaces=namespaces, label=label, df=df)
 
     def check_targets_type(self):
-        """
-        Check targets arguments (label, tag, importance, base) conformity
+        """Check targets arguments (label, tag) conformity.
 
         Raises
         ------
 
         TypeError
-            If any of the targets element is not of type SimpleLabel
-
+            If any of the targets element is not of type SimpleLabel.
         """
         wrong_type_targets = [
             key
@@ -1851,14 +1811,13 @@ def check_targets_type(self):
             )
 
     def check_namespaces_type(self):
-        """
-        Check namespaces arguments conformity
+        """Check if namespaces arguments are of type Namespace.
 
         Raises
         ------
 
         TypeError
-            If parameters namespaces is not of type Namespace
+            If namespaces are not of type Namespace.
         """
         wrong_type_namespaces = [
             not isinstance(namespace, Namespace)
@@ -1866,20 +1825,19 @@ def check_namespaces_type(self):
         ]
         if any(wrong_type_namespaces):
             raise TypeError(
-                "Parameter namespaces must be a (list of) Namespace object(s)"
+                "Argument `namespaces` should be "
+                "a Namespace object or a list of Namespace objects"
             )
 
     def check_features_type(self):
-        """
-        Check if elements of namespaces are of type features
+        """Check if 'features' attribute of namespaces are of type Feature.
 
         Raises
         ------
 
         TypeError
             If parameters any of the element in a 'Namespace' is not of type
-            'Feature'
-
+            'Feature'.
         """
         for ns in self.namespaces:
             features = ns.features
@@ -1888,19 +1846,19 @@ def check_features_type(self):
             ]
             if any(wrong_type_features):
                 raise TypeError(
-                    "Elements of 'Namespace' object must be of type 'Feature'"
+                    "Argument 'features' of Namespace should be "
+                    "a Feature object or a list of Feature objects"
                 )
 
     def check_if_cols_exist(self):
-        """
-        Check if some columns specified are not in dataframe.
+        """Check if the columns specified in the constructor are in the
+        dataframe.
 
         Raises
         ------
 
         ValueError
-            If some columns are not in the dataframe.
-
+            If one or more columns are not in the dataframe.
         """
         absent_cols = []
 
@@ -1910,7 +1868,7 @@ def check_if_cols_exist(self):
         for target in targets_not_none:
             absent_cols += [
                 x.colname
-                for x in get_all_cols(target)
+                for x in _get_all_cols(target)
                 if not x.col_exist(self.df)
             ]
 
@@ -1918,7 +1876,7 @@ def check_if_cols_exist(self):
             for feature in ns.features:
                 absent_cols += [
                     x.colname
-                    for x in get_all_cols(feature)
+                    for x in _get_all_cols(feature)
                     if not x.col_exist(self.df)
                 ]
 
@@ -1930,28 +1888,24 @@ def check_if_cols_exist(self):
             raise ValueError(msg_error)
 
     def empty_col(self):
-        """
-        Create an empty string pandas column.
+        """Create an empty string column.
 
         Returns
         -------
 
         pandas.Series
             A column of empty string with as much rows as the input dataframe.
-
         """
         return pd.Series([""] * self.n_rows)
 
     def process_targets(self):
-        """
-        Process the targets into a unique pandas column
+        """Process the targets into a unique column.
 
         Returns
         -------
 
         out : pandas.Series
-            A column where each row is the processed targets
-
+            A column where each row is the processed targets.
         """
         out = self.empty_col()
 
@@ -1964,21 +1918,19 @@ def process_targets(self):
         return out
 
     def process_features(self, features):
-        """
-        Process the features (of a namespace) into a unique pandas column
+        """Process the features (of a namespace) into a unique column.
 
         Parameters
         ----------
 
         features : list of Feature
-            The list of Feature objects
+            The list of Feature objects.
 
         Returns
         -------
 
-        out : pandas.series
-            The column of the processed features
-
+        out : pandas.Series
+            The column of the processed features.
         """
         out = self.empty_col()
         for feature in features:
@@ -1986,14 +1938,13 @@ def process_features(self, features):
         return out
 
     def process_df(self):
-        """
-        Main method that do the conversion of the dataframe to the VW format
+        """Main method that converts the pandas dataframe to the VW format.
 
         Returns
         -------
 
         list
-            The list of parsed lines in VW format
+            The list of parsed lines in VW format.
         """
         if not all(x is None for x in self.targets.values()):
             self.out += self.process_targets()
@@ -2008,4 +1959,3 @@ def process_df(self):
 
         return self.out.to_list()
 
-

From 8fff16841152a5e59b2606a99f76046b02b988c0 Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Fri, 22 May 2020 13:07:06 +0200
Subject: [PATCH 15/18] simplify tag parameter, add type checking for
 'from_colnames' constructor, make not found columns method more explicit

---
 python/tests/test_pyvw.py   |  13 +-
 python/vowpalwabbit/pyvw.py | 249 +++++++++++++++++++++++++++++-------
 2 files changed, 208 insertions(+), 54 deletions(-)

diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py
index 949126ae6f8..26bf746a92b 100644
--- a/python/tests/test_pyvw.py
+++ b/python/tests/test_pyvw.py
@@ -359,7 +359,7 @@ def test_feature_column_renaming_and_tag():
     df = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]})
     conv = DFtoVW(
         label=SimpleLabel(Col("y")),
-        tag=SimpleLabel(Col("idx")),
+        tag=Col("idx"),
         namespaces=Namespace([Feature(name="col_x", value=Col("x"))]),
         df=df,
     )
@@ -371,7 +371,7 @@ def test_feature_constant_column_with_empty_name():
     df = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]})
     conv = DFtoVW(
         label=SimpleLabel(Col("y")),
-        tag=SimpleLabel(Col("idx")),
+        tag=Col("idx"),
         namespaces=Namespace([Feature(name="", value=2)]),
         df=df,
     )
@@ -429,13 +429,12 @@ def test_absent_col_error():
         df = pd.DataFrame({"a": [1]})
         conv = DFtoVW(
             df=df,
-            label=SimpleLabel(Col("b")),
+            label=SimpleLabel(Col("a")),
             namespaces=Namespace(
-                [Feature(Col("b")), Feature(Col("c")), Feature("d")]
+                [Feature(Col("a")), Feature(Col("c")), Feature("d")]
             ),
         )
-    expected = "The following columns do not exist in the dataframe: '{}', '{}'".format(
-        "b", "c"
-    )
+    expected = "In argument 'features', column(s) 'c' not found in dataframe"
     assert expected == str(value_error.value)
 
+
diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index 9f41c9b2c15..966e41f3d1e 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -1506,6 +1506,154 @@ def _check_type(obj, expected_type):
             )
 
 
+class Col:
+    """Refer to a column of a dataframe.
+    The methods of this class are used to:
+        - check if the column is in a specified dataframe
+        - extract the column from the specified dataframe
+    """
+
+    def __init__(self, colname):
+        """Initialize a Col object.
+
+        Parameters
+        ----------
+
+        colname : str
+            The colname that refers to a column.
+
+        Raises
+        ------
+
+        TypeError
+            If attribute 'colname' is not a string.
+
+        Returns
+        -------
+
+        self : Col
+        """
+        if isinstance(colname, str):
+            self.colname = colname
+        else:
+            raise TypeError("'colname' must be a string")
+
+    def col_exist(self, df):
+        """Check if the column 'colname' is in a dataframe 'df'.
+
+        Parameters
+        ----------
+
+        df : pandas.DataFrame
+            The dataframe in which to look for the column.
+
+        Returns
+        -------
+
+        bool
+            True if the column is in the dataframe, False otherwise.
+        """
+        return self.colname in df
+
+    def get_col(self, df):
+        """Extract the column 'colname' from the dataframe 'df'.
+
+        Parameters
+        ----------
+
+        df : pandas.DataFrame
+            The dataframe from which to extract the column 'colname'.
+
+        Raises
+        ------
+
+        KeyError
+            If the column is not found in the dataframe.
+
+        Returns
+        -------
+
+        out : pandas.Series
+            The column extracted from the dataframe.
+        """
+        try:
+            out = df[self.colname].fillna("").apply(str)
+        except KeyError:
+            raise KeyError(
+                "Column '{}' not found in dataframe".format(self.colname)
+            )
+        else:
+            return out
+
+
+def _get_col_or_value(x, df):
+    """Returns the column 'colname' from dataframe 'df' if x is a Col
+    object else returns the value of x converted to string.
+
+    Parameters
+    ----------
+
+    x : Col/str/int/float
+        The Col object or a literal value (str/int/float).
+    df : pandas.DataFrame
+        The dataframe in which to extract the column.
+
+    Returns
+    -------
+
+    out : str or pandas.Series
+        A pandas.Series if x is of type 'Col' or a string otherwise.
+    """
+    try:
+        out = x.get_col(df)
+    except AttributeError:
+        out = str(x)
+    return out
+
+
+def _get_all_cols(obj):
+    """Returns the attributes of type Col of a given instance. Note that this
+    method won't search for Col types in the attributes of the attributes
+    (no recursive search).
+
+    Returns
+    -------
+
+    out : list (of Col)
+        The list of Col objects in the instance.
+    """
+    attr_values = list(vars(obj).values())
+    out = [x for x in attr_values if isinstance(x, Col)]
+    return out
+
+
+def _check_type(obj, expected_type):
+    """Check if the type of an object is valid.
+
+    Parameters
+    ----------
+
+    obj : object
+        The object to check.
+    expected_type : type or tuple of types
+        The type(s) to check against.
+
+    Raises
+    ------
+
+    TypeError
+        If the argument is not of a valid type.
+    """
+    expected_type_str = str([x.__name__ for x in expected_type])
+    if obj is not None:
+        if not isinstance(obj, expected_type):
+            raise TypeError(
+                "Parameter {} should be of type(s) {}".format(
+                    obj, expected_type_str[1:-1]
+                )
+            )
+
+
 class SimpleLabel:
     """The SimpleLabel class is used to build a simple label for the
     constructor of DFtoVW.
@@ -1688,7 +1836,7 @@ def __init__(self, df, namespaces, label=None, tag=None):
             more Feature object(s).
         label : SimpleLabel
             The label is the real numbers to be predicted for the examples.
-        tag :  str
+        tag :  Col or str
             The tag that is used as identifiers for the examples.
 
         Examples
@@ -1722,16 +1870,14 @@ def __init__(self, df, namespaces, label=None, tag=None):
         """
         self.df = df
         self.n_rows = df.shape[0]
-        self.targets = collections.OrderedDict()
-        for (key, value) in zip(["label", "tag"], [label, tag]):
-            self.targets[key] = value
-        self.no_tag = tag is not None
+        self.label = label
+        self.tag = tag
         self.namespaces = (
             list(namespaces)
             if isinstance(namespaces, (list, set))
             else [namespaces]
         )
-        self.check_targets_type()
+        self.check_label_type()
         self.check_namespaces_type()
         self.check_features_type()
         self.check_if_cols_exist()
@@ -1756,7 +1902,7 @@ def from_colnames(cls, y, x, df):
         ------
 
         TypeError
-            DESCRIPTION.
+            If argument label is a list of multiple strings
 
         Examples
         --------
@@ -1779,8 +1925,14 @@ def from_colnames(cls, y, x, df):
                 y = y[0]
             else:
                 raise ValueError(
-                    "Parameter should be a string or a list of one string"
+                    "Parameter should a list of one string (or a string)"
                 )
+        if not isinstance(x, str):
+            raise TypeError("Argument 'x' should be a string")
+        if not isinstance(x, str):
+            raise TypeError(
+                "Argument 'y' should be a string or a list of one string"
+            )
 
         label = SimpleLabel(Col(y))
         x = list(x) if isinstance(x, (list, set)) else [x]
@@ -1789,26 +1941,17 @@ def from_colnames(cls, y, x, df):
         )
         return cls(namespaces=namespaces, label=label, df=df)
 
-    def check_targets_type(self):
-        """Check targets arguments (label, tag) conformity.
+    def check_label_type(self):
+        """Check label argument conformity.
 
         Raises
         ------
 
         TypeError
-            If any of the targets element is not of type SimpleLabel.
+            If label is not of type SimpleLabel.
         """
-        wrong_type_targets = [
-            key
-            for (key, value) in self.targets.items()
-            if not isinstance(value, SimpleLabel) and value is not None
-        ]
-        if wrong_type_targets:
-            raise TypeError(
-                "Parameter(s) {} must be of type 'SimpleLabel'".format(
-                    str(wrong_type_targets)[1:-1]
-                )
-            )
+        if not isinstance(self.label, SimpleLabel) and self.label is not None:
+            raise TypeError("Argument 'label' must be of type 'SimpleLabel'")
 
     def check_namespaces_type(self):
         """Check if namespaces arguments are of type Namespace.
@@ -1860,31 +2003,46 @@ def check_if_cols_exist(self):
         ValueError
             If one or more columns are not in the dataframe.
         """
-        absent_cols = []
+        absent_cols = {}
 
-        targets_not_none = [
-            target for target in self.targets.values() if target is not None
-        ]
-        for target in targets_not_none:
-            absent_cols += [
+        if self.label is not None:
+            absent_cols["label"] = [
                 x.colname
-                for x in _get_all_cols(target)
+                for x in _get_all_cols(self.label)
                 if not x.col_exist(self.df)
             ]
 
+        if self.tag is not None:
+            if isinstance(self.tag, Col) and not self.tag.col_exist(self.df):
+                absent_cols["tag"] = repr(self.tag.colname)
+
+        missing_features_cols = []
         for ns in self.namespaces:
             for feature in ns.features:
-                absent_cols += [
+                missing_features_cols += [
                     x.colname
                     for x in _get_all_cols(feature)
                     if not x.col_exist(self.df)
                 ]
-
-        unique_absent_cols = sorted(list(set(absent_cols)))
-        if len(absent_cols) > 0:
-            msg_error = "The following columns do not exist in the dataframe: {}".format(
-                str(unique_absent_cols)[1:-1]
+        absent_cols["features"] = sorted(list(set(missing_features_cols)))
+
+        absent_cols = {
+            key: value for (key, value) in absent_cols.items() if len(value) > 0
+        }
+        msg_error = ""
+        for arg_name, missing_cols in absent_cols.items():
+            missing_cols = (
+                repr(missing_cols)[1:-1]
+                if isinstance(missing_cols, list)
+                else missing_cols
+            )
+            if len(msg_error) > 0:
+                msg_error += "\n"
+            msg_error += "In argument '{}', column(s) {} not found in dataframe".format(
+                arg_name, missing_cols
             )
+
+        if absent_cols:
             raise ValueError(msg_error)
 
     def empty_col(self):
@@ -1898,23 +2056,20 @@ def empty_col(self):
         """
         return pd.Series([""] * self.n_rows)
 
-    def process_targets(self):
-        """Process the targets into a unique column.
+    def process_label_and_value(self):
+        """Process the label and value into a unique column.
 
         Returns
         -------
 
         out : pandas.Series
-            A column where each row is the processed targets.
+            A column where each row is the processed label and value.
         """
         out = self.empty_col()
-
-        for name, value in self.targets.items():
-            if value is not None:
-                to_add = value.process(self.df)
-                out += to_add if (name == "label") else (" " + to_add)
-            elif (value is None) and (name == "tag"):
-                out += " "
+        if self.label is not None:
+            out += self.label.process(self.df) + " "
+        if self.tag is not None:
+            out += _get_col_or_value(self.tag, self.df)
         return out
 
     def process_features(self, features):
@@ -1946,8 +2101,8 @@ def process_df(self):
         list
             The list of parsed lines in VW format.
         """
-        if not all(x is None for x in self.targets.values()):
-            self.out += self.process_targets()
+        if not all(x is None for x in [self.label, self.tag]):
+            self.out += self.process_label_and_value()
 
         for (num_ns, ns_obj) in enumerate(self.namespaces):
             to_add = ns_obj.process() + self.process_features(ns_obj.features)

From ac6bd4ef7e76643934aaaeb8d11630b2200f6fe5 Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Fri, 22 May 2020 17:52:29 +0200
Subject: [PATCH 16/18] fix type checking for x in 'from_colnames' constructor,
 remove unused collections import

---
 python/vowpalwabbit/pyvw.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index 966e41f3d1e..a7f78a20dd0 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -5,7 +5,6 @@
 import pylibvw
 import warnings
 import pandas as pd
-import collections
 
 class SearchTask():
     """Search task class"""
@@ -1927,15 +1926,19 @@ def from_colnames(cls, y, x, df):
                 raise ValueError(
                     "Parameter should a list of one string (or a string)"
                 )
-        if not isinstance(x, str):
-            raise TypeError("Argument 'x' should be a string")
-        if not isinstance(x, str):
+        if not isinstance(y, str):
             raise TypeError(
                 "Argument 'y' should be a string or a list of one string"
             )
 
         label = SimpleLabel(Col(y))
+
         x = list(x) if isinstance(x, (list, set)) else [x]
+        if not all(isinstance(xi, str) for xi in x):
+            raise TypeError(
+                "Argument 'x' should be a string or a list of string"
+            )
+
         namespaces = Namespace(
             features=[Feature(value=Col(colname)) for colname in x]
         )

From 736a5699184f7a34f63d47a96beb8097b8abf5c7 Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Tue, 26 May 2020 16:16:43 +0200
Subject: [PATCH 17/18] change name of function process_label_and_value to
 process_label_and_tag

---
 python/vowpalwabbit/pyvw.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index a7f78a20dd0..6a736a3a352 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -2059,14 +2059,14 @@ def empty_col(self):
         """
         return pd.Series([""] * self.n_rows)
 
-    def process_label_and_value(self):
-        """Process the label and value into a unique column.
+    def process_label_and_tag(self):
+        """Process the label and tag into a unique column.
 
         Returns
         -------
 
         out : pandas.Series
-            A column where each row is the processed label and value.
+            A column where each row is the processed label and tag.
         """
         out = self.empty_col()
         if self.label is not None:
@@ -2105,7 +2105,7 @@ def process_df(self):
             The list of parsed lines in VW format.
         """
         if not all(x is None for x in [self.label, self.tag]):
-            self.out += self.process_label_and_value()
+            self.out += self.process_label_and_tag()
 
         for (num_ns, ns_obj) in enumerate(self.namespaces):
             to_add = ns_obj.process() + self.process_features(ns_obj.features)

From 883f2569250a8dc3d2a1025112ed02448a79df3c Mon Sep 17 00:00:00 2001
From: Etienne Kintzler <etienne.kintzler@gmail.com>
Date: Tue, 26 May 2020 18:17:42 +0200
Subject: [PATCH 18/18] fix anomaly when calling process_df multiple times

---
 python/vowpalwabbit/pyvw.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
index 6a736a3a352..439cd12c084 100644
--- a/python/vowpalwabbit/pyvw.py
+++ b/python/vowpalwabbit/pyvw.py
@@ -1880,7 +1880,6 @@ def __init__(self, df, namespaces, label=None, tag=None):
         self.check_namespaces_type()
         self.check_features_type()
         self.check_if_cols_exist()
-        self.out = self.empty_col()
 
     @classmethod
     def from_colnames(cls, y, x, df):
@@ -2104,6 +2103,8 @@ def process_df(self):
         list
             The list of parsed lines in VW format.
         """
+        self.out = self.empty_col()
+
         if not all(x is None for x in [self.label, self.tag]):
             self.out += self.process_label_and_tag()