-
Notifications
You must be signed in to change notification settings - Fork 98
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding ignore_index to pandas_to_eland #154
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -65,6 +65,40 @@ def test_generate_es_mappings(self): | |
|
||
assert_pandas_eland_frame_equal(df, ed_df_head) | ||
|
||
ES_TEST_CLIENT.indices.delete(index=index_name) | ||
|
||
def test_pandas_to_eland_ignore_index(self): | ||
df = pd.DataFrame( | ||
data={ | ||
"A": np.random.rand(3), | ||
"B": 1, | ||
"C": "foo", | ||
"D": pd.Timestamp("20190102"), | ||
"E": [1.0, 2.0, 3.0], | ||
"F": False, | ||
"G": [1, 2, 3], | ||
}, | ||
index=["0", "1", "2"], | ||
) | ||
|
||
# Now create index | ||
index_name = "test_pandas_to_eland_ignore_index" | ||
|
||
ed_df = ed.pandas_to_eland( | ||
df, | ||
ES_TEST_CLIENT, | ||
index_name, | ||
es_if_exists="replace", | ||
es_refresh=True, | ||
ignore_index=True, | ||
) | ||
pd_df = ed.eland_to_pandas(ed_df) | ||
|
||
# Compare values excluding index | ||
assert df.values.all() == pd_df.values.all() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this be |
||
|
||
ES_TEST_CLIENT.indices.delete(index=index_name) | ||
|
||
def test_eland_to_pandas_performance(self): | ||
# TODO quantify this | ||
ed.eland_to_pandas(self.ed_flights(), show_progress=True) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,6 +58,7 @@ def pandas_to_eland( | |
es_dropna=False, | ||
es_geo_points=None, | ||
chunksize=None, | ||
ignore_index=False, | ||
): | ||
""" | ||
Append a pandas DataFrame to an Elasticsearch index. | ||
|
@@ -86,7 +87,10 @@ def pandas_to_eland( | |
es_geo_points: list, default None | ||
List of columns to map to geo_point data type | ||
chunksize: int, default None | ||
number of pandas.DataFrame rows to read before bulk index into Elasticsearch | ||
Number of pandas.DataFrame rows to read before bulk index into Elasticsearch | ||
ignore_index: bool, default 'False' | ||
Ignore pandas.DataFrame.index when indexing into Elasticsearch? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Drop the ?, document the behavior for the true and false case |
||
If 'False' pandas.DataFrame.index fields will be used to populate Elasticsearch '_id' fields. | ||
|
||
Returns | ||
------- | ||
|
@@ -185,16 +189,19 @@ def pandas_to_eland( | |
actions = [] | ||
n = 0 | ||
for row in pd_df.iterrows(): | ||
# Use index as _id | ||
id = row[0] | ||
|
||
if es_dropna: | ||
values = row[1].dropna().to_dict() | ||
else: | ||
values = row[1].to_dict() | ||
|
||
# Use integer as id field for repeatable results | ||
action = {"_index": es_dest_index, "_source": values, "_id": str(id)} | ||
if ignore_index: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe we should change the name of this parameter to be more verbose but also clear on what "index" is being ignored since the concept appears in both pandas and ES. Maybe |
||
action = {"_index": es_dest_index, "_source": values} | ||
else: | ||
# Use index as _id | ||
id = row[0] | ||
|
||
# Use integer as id field for repeatable results | ||
action = {"_index": es_dest_index, "_source": values, "_id": str(id)} | ||
|
||
actions.append(action) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This unit test passes without the
ignore_index=True
, should add a that fails if not using the functionality.Maybe adding this assert is enough: