# Maintain Rows and Columns indices order #28

Merged: 4 commits, Jul 3, 2020
## ExtractTable/__init__.py (15 additions, 6 deletions)
```diff
@@ -105,13 +105,22 @@ def trigger_process(self, fp: BinaryIO, dup_check: bool = False, **kwargs) -> dict:

         # GetResult if JobId is present in the response
         # Usually happens when processing PDF files or idempotent requests
-        if 'JobId' in resp:
+        if 'JobId' in resp and resp.get("JobStatus", "") == JobStatus.PROCESSING:
+            if max_wait_time > 0:
+                print("[Info]: Waiting to retrieve the output; JobId:", resp['JobId'])
+            else:
+                print("[Info]: JobId:", resp['JobId'])
             resp = self.get_result(resp['JobId'], max_wait_time=max_wait_time)

         return resp

-    def bigfile_upload(self, filename):
-        resp = self._make_request('post', HOST.BIGFILE, data={"filename": filename})
+    def bigfile_upload(self, filepath):
+        """
+        To aid big file processing by uploading the file first and triggering the process next
+        :param filepath: filepath
+        :return: a signed URL to upload the file
+        """
+        resp = self._make_request('post', HOST.BIGFILE, data={"filename": filepath})

         return resp
```
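The effect of the new guard: previously, any response carrying a `JobId` sent the client into `get_result` polling, even when the job had already finished and the payload was in hand. Now polling happens only while the server still reports the job as in flight. A minimal sketch of that gating, assuming a string-valued `JobStatus` enum with a `PROCESSING` member (only `JobStatus.PROCESSING` is visible in the diff; the enum definition here is hypothetical):

```python
from enum import Enum


class JobStatus(str, Enum):
    # Hypothetical stand-in; only PROCESSING is referenced in the diff.
    PROCESSING = "Processing"
    SUCCESS = "Success"
    FAILED = "Failed"


def maybe_fetch_result(client, resp: dict, max_wait_time: int = 300) -> dict:
    """Poll for the output only while the server reports the job in flight."""
    if 'JobId' in resp and resp.get("JobStatus", "") == JobStatus.PROCESSING:
        # A finished (or failed) response already carries its payload,
        # so the extra round trips through get_result are skipped.
        return client.get_result(resp['JobId'], max_wait_time=max_wait_time)
    return resp
```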
```diff
@@ -132,7 +141,7 @@ def process_file(
             Example: '1,3,4' or '1,4-end' or 'all'.
         :param output_format: datafram as default; Check `ExtractTable._OUTPUT_FORMATS` to see available options
         :param dup_check: Idempotent requests handler
-        :param indexing: If row index is needed
+        :param indexing: Whether to output row & column indices in the outputs other than df
         :param kwargs:
             max_wait_time: int, optional (default: 300);
                 Maximum Time to wait before returning to the client
```
Expand All @@ -151,13 +160,13 @@ def process_file(
with open(infile.filepath, 'rb') as fp:
trigger_resp = self.trigger_process(fp, dup_check=dup_check, **kwargs)
except ClientFileSizeError:
big_gen = self.bigfile_upload(filename=os.path.basename(filepath))
big_gen = self.bigfile_upload(filepath=os.path.basename(filepath))
with open(filepath, 'rb') as ifile:
rq.post(big_gen['url'], data=big_gen['fields'], files={'file': ifile})
trigger_resp = self.trigger_process(None, signed_filename=os.path.basename(filepath), **kwargs)

for _type, _obj in trigger_resp.items():
self.__setattr__(_type, _obj)

result = ConvertTo(data=trigger_resp, fmt=output_format, index=indexing).output
result = ConvertTo(data=trigger_resp, fmt=output_format, indexing=indexing).output
return result
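For callers, the visible change in `process_file` is the renamed `indexing` keyword passed through to `ConvertTo`. A usage sketch under the new signature (the API key and file path are placeholders, and the session setup follows the library's documented pattern):

```python
# Usage sketch; the API key and file path are placeholders.
from ExtractTable import ExtractTable

et_sess = ExtractTable(api_key="YOUR-API-KEY")

# indexing=True keeps row & column indices in non-DataFrame
# outputs such as CSV; DataFrames carry them regardless.
tables = et_sess.process_file(
    filepath="invoice.pdf",
    output_format="csv",
    indexing=True,
)
print(tables)  # list of CSV file locations, one per detected table
```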
## ExtractTable/__version__.py (1 addition, 1 deletion)
```diff
@@ -1,4 +1,4 @@
-VERSION = (2, 0, 0)
+VERSION = (2, 0, 1)
 PRERELEASE = None  # "alpha", "beta" or "rc"
 REVISION = None
```
## ExtractTable/common.py (16 additions, 10 deletions)
```diff
@@ -4,6 +4,7 @@
 import os
 import tempfile
 import warnings
+import collections

 import pandas as pd
```
```diff
@@ -13,27 +14,32 @@ class ConvertTo:
     FORMATS = {"df", "dataframe", "json", "csv", "dict"}
     DEFAULT = "df"

-    def __init__(self, data: dict, fmt: str = DEFAULT, index: bool = False):
+    def __init__(self, data: dict, fmt: str = DEFAULT, indexing: bool = False):
         """

         :param data: Tabular JSON data from server
         :param fmt: format to be converted into
-        :param index: row index consideration in the output
+        :param indexing: row & column index consideration in the output
         """
         self.data = data
-        self.output = self._converter(fmt.lower(), index=index)
+        self.output = self._converter(fmt.lower(), indexing=indexing)

-    def _converter(self, fmt: str, index: bool = False) -> list:
+    def _converter(self, fmt: str, indexing: bool = False) -> list:
         """
         Actual conversion takes place here using Pandas
         :param fmt: format to be converted into
-        :param index: row index consideration in the output
+        :param indexing: row index consideration in the output
         :return: list of tables from converted into the requested output format
         """
-        # To convert the column indices to int to maintain the correct order on a table with more than 9 columns
-        dfs = [pd.DataFrame.from_dict(
-            {int(k): v for k, v in table["TableJson"].items()}, orient="index"
-        ) for table in self.data.get("Tables", [])]
+        dfs = []
+        for table in self.data.get("Tables", []):
+            tmp = {int(k): v for k, v in table["TableJson"].items()}
+            # To convert column indices to int to maintain the table order with more than 9 columns
+            cols = [str(x) for x in sorted([int(x) for x in tmp[0]])]
+            # To convert row indices to int and maintain the table order with more than 9 rows
+            tmp = collections.OrderedDict(sorted(tmp.items()))
+            dfs.append(pd.DataFrame.from_dict(tmp, orient="index", columns=cols))

         if fmt in ("df", "dataframe"):
             return dfs
         elif fmt == "dict":
```
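This loop is the heart of the PR. `TableJson` arrives keyed by strings, and Python sorts strings lexicographically, so on any table with more than 9 rows or columns, "10" lands before "2" and the reconstructed frame comes out scrambled. Casting the keys to `int` before sorting restores numeric order on both axes. A standalone sketch of the failure and the fix, with a fabricated 12-column payload:

```python
# Standalone sketch; the 2x12 TableJson payload is fabricated for illustration.
import collections

import pandas as pd

table_json = {str(r): {str(c): f"r{r}c{c}" for c in range(12)} for r in range(2)}

# String keys sort lexicographically: '10' and '11' come before '2'.
assert sorted(table_json["0"]) == [
    '0', '1', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9'
]

# Cast to int before sorting, as the new _converter does:
tmp = {int(k): v for k, v in table_json.items()}
cols = [str(x) for x in sorted(int(x) for x in tmp[0])]
tmp = collections.OrderedDict(sorted(tmp.items()))
df = pd.DataFrame.from_dict(tmp, orient="index", columns=cols)

print(df.columns.tolist())  # '0' through '11' in numeric order
```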
```diff
@@ -43,7 +49,7 @@ def _converter(self, fmt: str, index: bool = False) -> list:
             output_location = []
             for tbl_n, df in enumerate(dfs):
                 csv_name = os.path.join(save_folder, f"_table_{tbl_n+1}.csv")
-                df.to_csv(csv_name, index=index)
+                df.to_csv(csv_name, index=indexing, header=indexing)
                 output_location.append(csv_name)
             return output_location
         elif fmt == "json":
```
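The CSV branch previously wrote the header row unconditionally and keyed only the row labels off the flag; now both axes follow `indexing`, so `indexing=False` yields a file of pure cell values. A quick illustration of the two modes with a stand-in 2x2 frame:

```python
# Quick illustration of the to_csv change; df stands in for a converted table.
import pandas as pd

df = pd.DataFrame([["a", "b"], ["c", "d"]])

df.to_csv("with_indices.csv", index=True, header=True)
# ,0,1
# 0,a,b
# 1,c,d

df.to_csv("cells_only.csv", index=False, header=False)
# a,b
# c,d
```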
## README.md (1 addition, 1 deletion)
````diff
@@ -50,7 +50,7 @@ Its up to you now to explore the ways.

 # Explore
 check the complete server response of the latest job with `et_sess.ServerResponse.json()`
-```json
+```javascript
 {
     "JobStatus": <string>, # Status of the triggered Process @ JOB-LEVEL
     "Pages": <integer>, # Number of pages processed in this request @ PAGE-LEVEL
````
## example-code.ipynb (1 addition, 1 deletion)
````diff
@@ -307,7 +307,7 @@
    "source": [
     "> **Understand the output**: The response of a triggered job is a JSON object in the below format. Note that the response depends on the plan type of the API Key.\n",
     "\n",
-    "```json\n",
+    "```javascript\n",
     "{\n",
     "    \"JobStatus\": <string>, # Status of the triggered Process @ JOB-LEVEL\n",
     "    \"Pages\": <integer>, # Number of pages processed in this request @ PAGE-LEVEL\n",
````