Skip to content

Commit

Permalink
Merge pull request #1844 from VesnaT/transpose_guess_type
Browse files Browse the repository at this point in the history
[ENH] Table.transpose: Use heuristic to guess data type of attributes of attributes
  • Loading branch information
lanzagar authored Jan 4, 2017
2 parents e380ca2 + ff2cdaa commit aa1ab0b
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 73 deletions.
145 changes: 86 additions & 59 deletions Orange/data/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,80 @@ def _from_file(f):
return _from_file(filename)


def guess_data_type(orig_values):
    """
    Use heuristics to guess the data type of a column of raw values.

    The checks are tried in a fixed order: discrete, then continuous
    (every value converts with ``float``), then time (every value is
    accepted by ``TimeVariable.parse``), and finally string as fallback.

    :param orig_values: the raw values of a single column
    :return: tuple ``(valuemap, values, coltype)`` where ``valuemap`` is the
        sorted list of discrete values (empty for non-discrete columns),
        ``values`` holds the converted values (or `orig_values` unchanged
        when no conversion was made) and ``coltype`` is the variable class
    """
    # A truthy result is sorted and used as the list of discrete values.
    discrete = is_discrete_values(orig_values)
    if discrete:
        return sorted(discrete), orig_values, DiscreteVariable

    try:
        numbers = [float(item) for item in orig_values]
    except ValueError:
        # At least one value is not a number; try the next heuristic.
        pass
    else:
        return [], numbers, ContinuousVariable

    time_parser = TimeVariable('_')
    try:
        timestamps = [time_parser.parse(item) for item in orig_values]
    except ValueError:
        # Neither numeric nor time-like: keep the raw values as strings.
        return [], orig_values, StringVariable
    return [], timestamps, TimeVariable


def sanitize_variable(valuemap, values, orig_values, coltype, coltype_kwargs,
                      domain_vars, existing_var, new_var_name, data=None):
    """
    Convert a column's values to suit `coltype` and create its variable.

    :param valuemap: list of discrete values; when non-empty, `orig_values`
        are replaced by their indices in this list (NaN for values that are
        not in the list)
    :param values: the column's values as converted so far; may be modified
        in place when discrete values are reordered
    :param orig_values: the raw, unconverted values of the column
    :param coltype: variable class to instantiate
    :param coltype_kwargs: keyword arguments for the variable constructor;
        updated in place with ``values=valuemap`` when `valuemap` is given
    :param domain_vars: when ``None``, no variable is created
    :param existing_var: name of the variable; when truthy, the variable is
        obtained with ``coltype.make`` so an existing one may be reused
    :param new_var_name: name for a freshly constructed variable, used when
        `existing_var` is falsy
    :param data: optional array; when it is given and one-dimensional, the
        reordering of discrete values is applied to it instead of `values`
        (presumably the array the column was read from -- confirm against
        the caller)
    :return: tuple ``(values, var)``; ``var`` is ``None`` when `domain_vars`
        is ``None``
    """
    if valuemap:
        # Map discrete data to ints
        def valuemap_index(val):
            try:
                return valuemap.index(val)
            except ValueError:
                return np.nan

        values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
        coltype_kwargs.update(values=valuemap)

    if coltype is StringVariable:
        # Identity check: only the np.nan object itself becomes ''.
        values = ['' if i is np.nan else i for i in orig_values]

    var = None
    if domain_vars is not None:
        if existing_var:
            # Use existing variable if available
            var = coltype.make(existing_var.strip(), **coltype_kwargs)
        else:
            # Never use existing for un-named variables
            var = coltype(new_var_name, **coltype_kwargs)

        # Reorder discrete values to match existing variable
        if var.is_discrete and not var.ordered:
            new_order = var.values
            old_order = coltype_kwargs.get('values', var.values)
            if new_order != old_order:
                offset = len(new_order)
                # `data` is optional; without it the only column that can
                # be remapped is `values` itself.
                if data is None or data.ndim > 1:
                    column = values
                else:
                    column = data
                # Shift every code past the range of valid new indices so
                # that an already remapped code is never mistaken for an
                # old code that still awaits remapping.
                column += offset
                for val in var.values:
                    try:
                        oldval = old_order.index(val)
                    except ValueError:
                        continue
                    bn.replace(column, offset + oldval, new_order.index(val))

    if isinstance(var, TimeVariable) or coltype is TimeVariable:
        # Re-parse the values because only now after coltype.make call
        # above, variable var is the correct one
        _var = var if isinstance(var, TimeVariable) else TimeVariable('_')
        values = [_var.parse(i) for i in orig_values]

    return values, var


class Flags:
"""Parser for column flags (i.e. third header row)"""
DELIMITER = ' '
Expand Down Expand Up @@ -522,6 +596,7 @@ def _equal_length(lst):

elif (type_flag in DiscreteVariable.TYPE_HEADERS or
_RE_DISCRETE_LIST.match(type_flag)):
coltype = DiscreteVariable
if _RE_DISCRETE_LIST.match(type_flag):
valuemap = Flags.split(type_flag)
coltype_kwargs.update(ordered=True)
Expand All @@ -530,38 +605,7 @@ def _equal_length(lst):

else:
# No known type specified, use heuristics
is_discrete = is_discrete_values(orig_values)
if is_discrete:
valuemap = sorted(is_discrete)
else:
try:
values = [float(i) for i in orig_values]
except ValueError:
tvar = TimeVariable('_')
try:
values = [tvar.parse(i) for i in orig_values]
except ValueError:
coltype = StringVariable
else:
coltype = TimeVariable
else:
coltype = ContinuousVariable

if valuemap:
# Map discrete data to ints
def valuemap_index(val):
try:
return valuemap.index(val)
except ValueError:
return np.nan

values = np.vectorize(valuemap_index, otypes=[float])(orig_values)
coltype = DiscreteVariable
coltype_kwargs.update(values=valuemap)

if coltype is StringVariable:
values = ['' if i is np.nan else i
for i in orig_values]
valuemap, values, coltype = guess_data_type(orig_values)

if flag.m or coltype is StringVariable:
append_to = (Mcols, metas)
Expand All @@ -574,37 +618,20 @@ def valuemap_index(val):

cols, domain_vars = append_to
cols.append(col)
var = None

existing_var, new_var_name, column = None, None, None
if domain_vars is not None:
existing_var = names and names[col]
if not existing_var:
new_var_name = next(NAMEGEN)

values, var = sanitize_variable(
valuemap, values, orig_values, coltype, coltype_kwargs,
domain_vars, existing_var, new_var_name, data)
if domain_vars is not None:
if names and names[col]:
# Use existing variable if available
var = coltype.make(names[col].strip(), **coltype_kwargs)
else:
# Never use existing for un-named variables
var = coltype(next(NAMEGEN), **coltype_kwargs)
var.attributes.update(flag.attributes)
domain_vars.append(var)

# Reorder discrete values to match existing variable
if var.is_discrete and not var.ordered:
new_order, old_order = var.values, coltype_kwargs.get('values', var.values)
if new_order != old_order:
offset = len(new_order)
column = values if data.ndim > 1 else data
column += offset
for i, val in enumerate(var.values):
try:
oldval = old_order.index(val)
except ValueError:
continue
bn.replace(column, offset + oldval, new_order.index(val))

if isinstance(var, TimeVariable) or coltype is TimeVariable:
# Re-parse the values because only now after coltype.make call
# above, variable var is the correct one
_var = var if isinstance(var, TimeVariable) else TimeVariable('_')
values = [_var.parse(i) for i in orig_values]

# Write back the changed data. This is needed to pass the
# correct, converted values into Table.from_numpy below
try:
Expand Down
14 changes: 14 additions & 0 deletions Orange/data/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import bottleneck as bn
from scipy import sparse as sp

import Orange.data # import for io.py
from Orange.data import (
_contingency, _valuecount,
Domain, Variable, Storage, StringVariable, Unknown, Value, Instance,
Expand Down Expand Up @@ -1453,6 +1454,7 @@ def transpose(cls, table, feature_names_column="",
feature names are mapped
:return: Table - transposed table
"""

self = cls()
n_cols, self.n_rows = table.X.shape
old_domain = table.attributes.get("old_domain")
Expand Down Expand Up @@ -1522,10 +1524,22 @@ def get_table_from_attributes_of_attributes(_vars, _dtype=float):
names = chain.from_iterable(list(attr.attributes)
for attr in table.domain.attributes)
names = sorted(set(names) - {var.name for var in class_vars})

def guessed_var(i, var_name):
    # Guess the type of the i-th column of M, store the converted values
    # back into that column and return the variable created for it.
    column = M[:, i]
    value_map, parsed, var_type = Orange.data.io.guess_data_type(column)
    converted, new_var = Orange.data.io.sanitize_variable(
        value_map, parsed, column, var_type, {}, _metas, None, var_name)
    M[:, i] = converted
    return new_var

_metas = [StringVariable(n) for n in names]
if old_domain:
_metas = [m for m in old_domain.metas if m.name != meta_attr_name]
M = get_table_from_attributes_of_attributes(_metas, _dtype=object)
if not old_domain:
_metas = [guessed_var(i, m.name) for i, m in enumerate(_metas)]
if _metas:
self.metas = np.hstack((self.metas, M))
metas.extend(_metas)
Expand Down
59 changes: 45 additions & 14 deletions Orange/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2342,7 +2342,7 @@ def test_transpose_continuous_metas(self):
metas = [ContinuousVariable("m1")]
domain = Domain(attrs, metas=metas)
X = np.arange(8).reshape((4, 2))
M = np.array([0, 1, 0, 1])[:, None]
M = np.array([0.0, 1.0, 0.0, 1.0])[:, None]
data = Table(domain, X, metas=M)

att = [ContinuousVariable("Feature 1"), ContinuousVariable("Feature 2"),
Expand Down Expand Up @@ -2449,7 +2449,7 @@ def test_transpose_class_and_metas(self):
# original should not change
self.assertDictEqual(data.domain.attributes[0].attributes, {})

def test_transpose_attributes_of_attributes(self):
def test_transpose_attributes_of_attributes_discrete(self):
attrs = [ContinuousVariable("c1"), ContinuousVariable("c2")]
attrs[0].attributes = {"attr1": "a", "attr2": "aa"}
attrs[1].attributes = {"attr1": "b", "attr2": "bb"}
Expand All @@ -2458,11 +2458,12 @@ def test_transpose_attributes_of_attributes(self):

att = [ContinuousVariable("Feature 1"), ContinuousVariable("Feature 2"),
ContinuousVariable("Feature 3"), ContinuousVariable("Feature 4")]
metas = [StringVariable("Feature name"), StringVariable("attr1"),
StringVariable("attr2")]
metas = [StringVariable("Feature name"),
DiscreteVariable("attr1", values=("a", "b")),
DiscreteVariable("attr2", values=("aa", "bb"))]
domain = Domain(att, metas=metas)
result = Table(domain, np.arange(8).reshape((4, 2)).T,
metas=np.array([["c1", "a", "aa"], ["c2", "b", "bb"]]))
M = np.array([["c1", 0.0, 0.0], ["c2", 1.0, 1.0]], dtype=object)
result = Table(domain, np.arange(8).reshape((4, 2)).T, metas=M)

# transpose and compare
self._compare_tables(result, Table.transpose(data))
Expand All @@ -2475,6 +2476,33 @@ def test_transpose_attributes_of_attributes(self):
self.assertDictEqual(data.domain.attributes[0].attributes,
{"attr1": "a", "attr2": "aa"})

def test_transpose_attributes_of_attributes_continuous(self):
    """Numeric-looking attributes of attributes become continuous metas."""
    source_vars = [ContinuousVariable("c1"), ContinuousVariable("c2")]
    source_vars[0].attributes = {"attr1": "1.100", "attr2": "1.300"}
    source_vars[1].attributes = {"attr1": "2.200", "attr2": "2.300"}
    data = Table(Domain(source_vars), np.arange(8).reshape((4, 2)))

    features = [ContinuousVariable("Feature 1"),
                ContinuousVariable("Feature 2"),
                ContinuousVariable("Feature 3"),
                ContinuousVariable("Feature 4")]
    meta_vars = [StringVariable("Feature name"),
                 ContinuousVariable("attr1"),
                 ContinuousVariable("attr2")]
    expected_metas = np.array([["c1", 1.1, 1.3],
                               ["c2", 2.2, 2.3]], dtype=object)
    expected = Table(Domain(features, metas=meta_vars),
                     np.arange(8).reshape((4, 2)).T,
                     metas=expected_metas)

    # transpose and compare
    self._compare_tables(expected, Table.transpose(data))

    # transpose of transpose
    roundtrip = Table.transpose(Table.transpose(data), "Feature name")
    self._compare_tables(data, roundtrip)

    # original should not change
    self.assertDictEqual(data.domain.attributes[0].attributes,
                         {"attr1": "1.100", "attr2": "1.300"})

def test_transpose_attributes_of_attributes_missings(self):
attrs = [ContinuousVariable("c1"), ContinuousVariable("c2")]
attrs[0].attributes = {"attr1": "a", "attr2": "aa"}
Expand All @@ -2484,11 +2512,12 @@ def test_transpose_attributes_of_attributes_missings(self):

att = [ContinuousVariable("Feature 1"), ContinuousVariable("Feature 2"),
ContinuousVariable("Feature 3"), ContinuousVariable("Feature 4")]
metas = [StringVariable("Feature name"), StringVariable("attr1"),
StringVariable("attr2")]
metas = [StringVariable("Feature name"),
DiscreteVariable("attr1", values=("a", "b")),
DiscreteVariable("attr2", values=("aa",))]
domain = Domain(att, metas=metas)
result = Table(domain, np.arange(8).reshape((4, 2)).T,
metas=np.array([["c1", "a", "aa"], ["c2", "b", ""]]))
M = np.array([["c1", 0.0, 0.0], ["c2", 1.0, np.nan]], dtype=object)
result = Table(domain, np.arange(8).reshape((4, 2)).T, metas=M)

# transpose and compare
self._compare_tables(result, Table.transpose(data))
Expand Down Expand Up @@ -2517,10 +2546,11 @@ def test_transpose_class_metas_attributes(self):
att[1].attributes = {"cls": "2.000", "m1": "bb", "m2": "bbb"}
att[2].attributes = {"cls": "3.000", "m1": "cc", "m2": "ccc"}
att[3].attributes = {"cls": "4.000", "m1": "dd", "m2": "ddd"}
metas = [StringVariable("Feature name"), StringVariable("attr1"),
StringVariable("attr2")]
metas = [StringVariable("Feature name"),
DiscreteVariable("attr1", values=("a1", "b1")),
DiscreteVariable("attr2", values=("aa1", "bb1"))]
domain = Domain(att, metas=metas)
M = np.array([["c1", "a1", "aa1"], ["c2", "b1", "bb1"]])
M = np.array([["c1", 0.0, 0.0], ["c2", 1.0, 1.0]], dtype=object)
result = Table(domain, np.arange(8).reshape((4, 2)).T, metas=M)

# transpose and compare
Expand All @@ -2546,7 +2576,8 @@ def _compare_tables(self, table1, table2):
self.assertEqual(table1.n_rows, table2.n_rows)
np.testing.assert_array_equal(table1.X, table2.X)
np.testing.assert_array_equal(table1.Y, table2.Y)
np.testing.assert_array_equal(table1.metas, table2.metas)
np.testing.assert_array_equal(table1.metas.astype(str),
table2.metas.astype(str))
np.testing.assert_array_equal(table1.W, table2.W)

self.assertEqual([(type(x), x.name, x.attributes)
Expand Down

0 comments on commit aa1ab0b

Please sign in to comment.