VowpalWabbit · jackgerrits · May 27, 2020 · May 1, 2020 · May 1, 2020 · May 1, 2020
diff --git a/python/tests/test_pyvw.py b/python/tests/test_pyvw.py
@@ -2,7 +2,9 @@
 
 from vowpalwabbit import pyvw
 from vowpalwabbit.pyvw import vw
+from vowpalwabbit.pyvw import DataFrameToVW
 import pytest
+import pandas as pd
 
 BIT_SIZE = 18
 
@@ -344,3 +346,49 @@ def check_error_raises(type, argument):
     """
     with pytest.raises(type) as error:
         argument()
+
+
+def test_oneline_simple_conversion():
+    """One row with a label and a single feature column."""
+    frame = pd.DataFrame({"y": [1], "x": [2]})
+    converter = DataFrameToVW(frame, "{y} | {x}")
+    vw_lines = converter.process_df()
+    assert vw_lines[0] == "1 | 2"
+
+
+def test_oneline_with_column_renaming_and_tag():
+    """A tag placeholder next to the label and a renamed feature (col_x:{x})."""
+    frame = pd.DataFrame({"idx": ["id_1"], "y": [1], "x": [2]})
+    converter = DataFrameToVW(frame, "{y} {idx}| col_x:{x}")
+    vw_lines = converter.process_df()
+    assert vw_lines[0] == "1 id_1| col_x:2"
+
+
+def test_multiple_lines_conversion():
+    """Each DataFrame row becomes one output line, in row order."""
+    frame = pd.DataFrame({"y": [1, -1], "x": [1, 2]})
+    vw_lines = DataFrameToVW(frame, "{y} | {x}").process_df()
+    assert vw_lines == ["1 | 1", "-1 | 2"]
+
+
+def test_oneline_with_multiple_namespaces():
+    """Constant namespace text in the formula is copied through unchanged."""
+    frame = pd.DataFrame({"y": [1], "a": [2], "b": [3]})
+    converter = DataFrameToVW(frame, "{y} |FirstNameSpace {a} |DoubleIt:2 {b}")
+    vw_lines = converter.process_df()
+    assert vw_lines[0] == "1 |FirstNameSpace 2 |DoubleIt:2 3"
+
+
+def test_oneline_without_target():
+    """A formula may omit the label and start directly with the bar."""
+    frame = pd.DataFrame({"a": [2], "b": [3]})
+    converter = DataFrameToVW(frame, "| {a} {b}")
+    vw_lines = converter.process_df()
+    assert vw_lines[0] == "| 2 3"
+
+
+def test_absent_col_error():
+    """Placeholders naming columns missing from the frame must raise ValueError."""
+    with pytest.raises(ValueError) as excinfo:
+        DataFrameToVW(pd.DataFrame({"a": [1]}), "{a} | {b} {c}")
+    assert str(excinfo.value) == "Column(s) 'b', 'c' not in the DataFrame"
diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py
@@ -4,6 +4,8 @@
 from __future__ import division
 import pylibvw
 import warnings
+import pandas as pd
+import re
 
 class SearchTask():
     """Search task class"""
@@ -1354,3 +1356,123 @@ def get_label(self, label_class=simple_label):
             simple_label
         """
         return label_class(self)
+
+
+class DataFrameToVW:
+    """Convert a pandas DataFrame to vowpal wabbit input lines using a formula."""
+
+    # Captures the column name inside each "{...}" placeholder of a formula.
+    re_parse_col = re.compile(r"{([^{}]*)}")
+
+    # Raw strings: \w, \s and \| are regex escapes, not Python string escapes.
+    feature_name_pattern = r"(?:\w+[:*])"
+    feature_value_pattern = r"{[^{}]+}"
+    const_value_pattern = r"[\w.]+"
+    re_check_formula = re.compile(
+        r"(?:\s*\|?\s*{}?(?:{}|{})\s*)*".format(
+            feature_name_pattern, feature_value_pattern, const_value_pattern
+        )
+    )
+
+    def __init__(self, df, formula):
+        """
+        Convert a pandas DataFrame to the vowpal wabbit format given by formula.
+
+        Formula is a string where the feature value of a given column is specified
+        using the curly braces syntax (e.g: {name_of_the_column}). The rest of the
+        formula is constant text repeated on each line (whitespace runs collapse
+        to one space). See examples for more details.
+
+        The following column names cannot be used in the formula :
+            - column names that contain the character '{' or '}'
+            - the empty string ''
+
+        Parameters
+        ----------
+        df : pandas.DataFrame
+            The DataFrame to convert
+        formula : str
+            The formula specifying the desired vowpal wabbit input format.
+
+        Raises
+        ------
+        ValueError
+            If the formula is malformed or names a column absent from df.
+
+        Examples
+        --------
+        >>> from vowpalwabbit.pyvw import DataFrameToVW
+        >>> import pandas as pd
+        >>> df = pd.DataFrame({"y": [0], "x": [1]})
+        >>> conv = DataFrameToVW(df, "{y} | {x}")
+        >>> vw_lines = conv.process_df()
+
+        >>> df2 = pd.DataFrame({"y": [0], "x": [1], "z": [2]})
+        >>> conv2 = DataFrameToVW(df2, '{y} |AllFeatures {x} {z}')
+        >>> vw_lines2 = conv2.process_df()
+        """
+        self.df = df
+        self.n_rows = df.shape[0]
+        self.column_names = set(df.columns)
+        self.formula = re.sub(r"\s+", " ", formula).strip()
+        self.check_formula()
+        self.check_absent_cols()
+
+    def check_formula(self):
+        """
+        Check that the whole formula matches the supported syntax.
+
+        Raises
+        ------
+        ValueError
+            If only a prefix of the formula matches the expected syntax.
+        """
+        # The pattern can match the empty string, so match() never returns None.
+        match = self.re_check_formula.match(self.formula)
+        if match.group() != self.formula:
+            raise ValueError(
+                "Error parsing formula.\nValid: '{}'\nNot valid: '{}'".format(
+                    self.formula[: match.end()], self.formula[match.end() :]
+                )
+            )
+
+    def check_absent_cols(self):
+        """
+        Check that every column referenced in the formula exists in the DataFrame.
+
+        Raises
+        ------
+        ValueError
+            If a column specified in the formula does not exist in the dataframe
+        """
+        all_cols = self.re_parse_col.findall(self.formula)
+        absent_cols = [col for col in all_cols if col not in self.column_names]
+        # Test the list itself: any() would inspect the truthiness of the names.
+        if absent_cols:
+            absent_cols_str = ", ".join(repr(col) for col in absent_cols)
+            raise ValueError(
+                "Column(s) {} not in the DataFrame".format(absent_cols_str)
+            )
+
+    def process_df(self):
+        """
+        Convert pandas.DataFrame to a suitable vowpal wabbit input format
+
+        Returns
+        -------
+        out : list of str
+            The lines of the DataFrame in vowpal wabbit input format
+        """
+        # Reuse the DataFrame's index so the additions below stay element-wise;
+        # a default RangeIndex would label-align and produce NaN rows.
+        out = pd.Series([""] * self.n_rows, index=self.df.index)
+        current_pos = 0
+        for match in self.re_parse_col.finditer(self.formula):
+            start_pos, end_pos = match.span()
+            out += self.formula[current_pos:start_pos]
+            out += self.df[match.group(1)].apply(str)
+            current_pos = end_pos
+        out += self.formula[current_pos:]
+        return out.to_list()
+
+