Skip to content

Commit

Permalink
Support Google Docs, Spreadsheets, and Slides with export format
Browse files Browse the repository at this point in the history
  • Loading branch information
wkentaro committed Mar 24, 2023
1 parent 665db95 commit 23a4477
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 1 deletion.
6 changes: 6 additions & 0 deletions gdown/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ def main():
help="(folder only) asserts that is ok to download max "
"{max} files per folder.".format(max=MAX_NUMBER_FILES),
)
parser.add_argument(
"--format",
help="Format of Google Docs, Spreadsheets and Slides. "
"Default is Google Docs: 'docx', Spreadsheet: 'xlsx', Slides: 'pptx'.",
)

args = parser.parse_args()

Expand Down Expand Up @@ -159,6 +164,7 @@ def main():
id=id,
fuzzy=args.fuzzy,
resume=args.continue_,
format=args.format,
)
success = filename is not None

Expand Down
45 changes: 45 additions & 0 deletions gdown/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def download(
id=None,
fuzzy=False,
resume=False,
format=None,
):
"""Download file from URL.
Expand Down Expand Up @@ -125,6 +126,11 @@ def download(
resume: bool
Resume the download from existing tmp file if possible.
Default is False.
format: str, optional
Export format used when the file is a Google Docs, Sheets, or Slides
document. When None, the default depends on the document type:
- Google Docs: 'docx'
- Google Sheets: 'xlsx'
- Google Slides: 'pptx'
Returns
-------
Expand Down Expand Up @@ -162,6 +168,45 @@ def download(
print(e, file=sys.stderr)
return

if url == url_origin and res.status_code == 500:
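# A 500 on the original URL may mean the file is a Google Docs, Sheets,
# or Slides document; retry via the generic "open" URL so the document
# type can be detected from the page title below.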
url = "https://drive.google.com/open?id={id}".format(
id=gdrive_file_id
)
continue
m = re.search("<title>(.+)</title>", res.text)
if m and m.groups()[0].endswith(" - Google Docs"):
url = (
"https://docs.google.com/document/d/{id}/export"
"?format={format}".format(
id=gdrive_file_id,
format="docx" if format is None else format,
)
)
continue
elif m and m.groups()[0].endswith(" - Google Sheets"):
url = (
"https://docs.google.com/spreadsheets/d/{id}/export"
"?format={format}".format(
id=gdrive_file_id,
format="xlsx" if format is None else format,
)
)
continue
elif (m and m.groups()[0].endswith(" - Google Slides")) or (
"Content-Disposition" in res.headers
and res.headers["Content-Disposition"].endswith("pptx")
and format not in {None, "pptx"}
):
url = (
"https://docs.google.com/presentation/d/{id}/export"
"?format={format}".format(
id=gdrive_file_id,
format="pptx" if format is None else format,
)
)
continue

if use_cookies:
if not osp.exists(osp.dirname(cookies_file)):
os.makedirs(osp.dirname(cookies_file))
Expand Down
7 changes: 6 additions & 1 deletion gdown/parse_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,12 @@ def parse_url(url, warning=True):
if len(file_ids) == 1:
file_id = file_ids[0]
else:
patterns = [r"^/file/d/(.*?)/view$", r"^/presentation/d/(.*?)/edit$"]
patterns = [
r"^/file/d/(.*?)/view$",
r"^/document/d/(.*?)/(view|edit)$",
r"^/presentation/d/(.*?)/(view|edit)$",
r"^/spreadsheets/u/0/d/(.*?)/(htmlview|edit)$",
]
for pattern in patterns:
match = re.match(pattern, parsed.path)
if match:
Expand Down

0 comments on commit 23a4477

Please sign in to comment.