Support Google Docs, Spreadsheets, and Slides with export format

wkentaro · Mar 24, 2023 · 23a4477
1 parent 665db95
commit 23a4477
Show file tree

Hide file tree

Showing 3 changed files with 57 additions and 1 deletion.
diff --git a/gdown/cli.py b/gdown/cli.py
@@ -108,6 +108,11 @@ def main():
         help="(folder only) asserts that is ok to download max "
         "{max} files per folder.".format(max=MAX_NUMBER_FILES),
     )
+    parser.add_argument(
+        "--format",
+        help="Format of Google Docs, Spreadsheets and Slides. "
+        "Default is Google Docs: 'docx', Spreadsheet: 'xlsx', Slides: 'pptx'.",
+    )
 
     args = parser.parse_args()
 
@@ -159,6 +164,7 @@ def main():
             id=id,
             fuzzy=args.fuzzy,
             resume=args.continue_,
+            format=args.format,
         )
         success = filename is not None
 

diff --git a/gdown/download.py b/gdown/download.py
@@ -97,6 +97,7 @@ def download(
     id=None,
     fuzzy=False,
     resume=False,
+    format=None,
 ):
     """Download file from URL.
 
@@ -125,6 +126,11 @@ def download(
     resume: bool
         Resume the download from existing tmp file if possible.
         Default is False.
+    format: str, optional
+        Format of Google Docs, Spreadsheets and Slides. Default is:
+            - Google Docs: 'docx'
+            - Google Spreadsheet: 'xlsx'
+            - Google Slides: 'pptx'
 
     Returns
     -------
@@ -162,6 +168,45 @@ def download(
             print(e, file=sys.stderr)
             return
 
+        if url == url_origin and res.status_code == 500:
+            # The file could be Google Docs or Spreadsheets.
+            url = "https://drive.google.com/open?id={id}".format(
+                id=gdrive_file_id
+            )
+            continue
+        m = re.search("<title>(.+)</title>", res.text)
+        if m and m.groups()[0].endswith(" - Google Docs"):
+            url = (
+                "https://docs.google.com/document/d/{id}/export"
+                "?format={format}".format(
+                    id=gdrive_file_id,
+                    format="docx" if format is None else format,
+                )
+            )
+            continue
+        elif m and m.groups()[0].endswith(" - Google Sheets"):
+            url = (
+                "https://docs.google.com/spreadsheets/d/{id}/export"
+                "?format={format}".format(
+                    id=gdrive_file_id,
+                    format="xlsx" if format is None else format,
+                )
+            )
+            continue
+        elif (m and m.groups()[0].endswith(" - Google Slides")) or (
+            "Content-Disposition" in res.headers
+            and res.headers["Content-Disposition"].endswith("pptx")
+            and format not in {None, "pptx"}
+        ):
+            url = (
+                "https://docs.google.com/presentation/d/{id}/export"
+                "?format={format}".format(
+                    id=gdrive_file_id,
+                    format="pptx" if format is None else format,
+                )
+            )
+            continue
+
         if use_cookies:
             if not osp.exists(osp.dirname(cookies_file)):
                 os.makedirs(osp.dirname(cookies_file))

diff --git a/gdown/parse_url.py b/gdown/parse_url.py
@@ -24,7 +24,12 @@ def parse_url(url, warning=True):
         if len(file_ids) == 1:
             file_id = file_ids[0]
     else:
-        patterns = [r"^/file/d/(.*?)/view$", r"^/presentation/d/(.*?)/edit$"]
+        patterns = [
+            r"^/file/d/(.*?)/view$",
+            r"^/document/d/(.*?)/(view|edit)$",
+            r"^/presentation/d/(.*?)/(view|edit)$",
+            r"^/spreadsheets/u/0/d/(.*?)/(htmlview|edit)$",
+        ]
         for pattern in patterns:
             match = re.match(pattern, parsed.path)
             if match: