Complete anchors/headings with data from Webref (#2426)

speced · Dec 19, 2022 · eacff10 · eacff10
1 parent 6bd0f75
commit eacff10
Showing 1 changed file with 217 additions and 19 deletions.
diff --git a/bikeshed/update/updateCrossRefs.py b/bikeshed/update/updateCrossRefs.py
@@ -7,6 +7,7 @@
 
 import certifi
 import tenacity
+import requests
 from json_home_client import Client as APIClient
 
 from .. import config, messages as m, t
@@ -45,6 +46,32 @@ class HeadingT(t.TypedDict):
         total=False,
     )
 
+    RawWebrefAnchorT = t.TypedDict(
+        "RawWebrefAnchorT",
+        {
+            "id": t.Required[str],
+            "type": t.Required[str],
+            "for": list[str],
+            "informative": bool,
+            "access": t.Required[str],
+            "linkingText": list[str],
+            "href": t.Required[str],
+        },
+        total=False,
+    )
+
+    RawWebrefHeadingT = t.TypedDict(
+        "RawWebrefHeadingT",
+        {
+            "id": t.Required[str],
+            "href": t.Required[str],
+            "title": t.Required[str],
+            "level": t.Required[int],
+            "number": t.Required[str],
+        },
+        total=False,
+    )
+
     # Need to use function form due to "for" key
     # being invalid as a property name
     AnchorT = t.TypedDict(
@@ -72,6 +99,16 @@ class RawSpecT(t.TypedDict, total=False):
         draft_uri: str | None
         abstract: str | None
 
+    class RawWebrefSpecT(t.TypedDict, total=False):
+        url: t.Required[str]
+        shortname: t.Required[str]
+        title: t.Required[str]
+        shortTitle: t.Required[str]
+        series: t.Required(dict[str, str])
+        nightly: t.Required(dict[str, str | list(str)])
+        release: dict[str, str | list(str)] | None
+        seriesVersion: str | None
+
     class SpecT(t.TypedDict):
         vshortname: str
         shortname: str
@@ -88,7 +125,38 @@ def progressMessager(index: int, total: int) -> t.Callable[[], None]:
 
 
 def update(path: str, dryRun: bool = False) -> set[str] | None:
-    m.say("Downloading anchor data...")
+    def setStatus(obj: RawAnchorT, status: str) -> RawAnchorT:
+        obj["status"] = status
+        return obj
+
+    def isSpecInList(spec: SpecT, specs: SpecsT) -> bool:
+        if spec["vshortname"].lower() in specs:
+            return True
+        for s in specs.values():
+            # Shortnames don't always match. Let's compare URLs
+            if (s["snapshot_url"] is not None and s["snapshot_url"] == spec["snapshot_url"]) or (
+                s["current_url"] is not None and s["current_url"] == spec["current_url"]
+            ):
+                return True
+        return False
+
+    def processRawAnchors(rawAnchorData: list(RawAnchorT), anchors: AnchorsT, specHeadings: AllHeadingsT) -> None:
+        for rawAnchor in rawAnchorData:
+            rawAnchor = fixupAnchor(rawAnchor)
+            linkingTexts = rawAnchor["linking_text"]
+            assert linkingTexts is not None
+            if len(linkingTexts) == 0:
+                # Happens if it had no linking text at all originally
+                continue
+            if len(linkingTexts) == 1 and linkingTexts[0].strip() == "":
+                # Happens if it was marked with an empty lt and Shepherd still picked it up
+                continue
+            if "section" in rawAnchor and rawAnchor["section"] is True:
+                addToHeadings(rawAnchor, specHeadings, spec=spec)
+            if rawAnchor["type"] in config.dfnTypes.union(["dfn"]):
+                addToAnchors(rawAnchor, anchors, spec=spec)
+
+    m.say("Downloading anchor data from Shepherd...")
     shepherd = APIClient(
         "https://api.csswg.org/shepherd/",
         version="vnd.csswg.shepherd.v1",
@@ -115,27 +183,78 @@ def update(path: str, dryRun: bool = False) -> set[str] | None:
         specHeadings: HeadingsT = {}
         headings[spec["vshortname"]] = specHeadings
 
-        def setStatus(obj: RawAnchorT, status: str) -> RawAnchorT:
-            obj["status"] = status
-            return obj
-
         rawAnchorData = [setStatus(x, "snapshot") for x in linearizeAnchorTree(rawSpec.get("anchors", []))] + [
             setStatus(x, "current") for x in linearizeAnchorTree(rawSpec.get("draft_anchors", []))
         ]
-        for rawAnchor in rawAnchorData:
-            rawAnchor = fixupAnchor(rawAnchor)
-            linkingTexts = rawAnchor["linking_text"]
-            assert linkingTexts is not None
-            if len(linkingTexts) == 0:
-                # Happens if it had no linking text at all originally
-                continue
-            if len(linkingTexts) == 1 and linkingTexts[0].strip() == "":
-                # Happens if it was marked with an empty lt and Shepherd still picked it up
-                continue
-            if "section" in rawAnchor and rawAnchor["section"] is True:
-                addToHeadings(rawAnchor, specHeadings, spec=spec)
-            if rawAnchor["type"] in config.dfnTypes.union(["dfn"]):
-                addToAnchors(rawAnchor, anchors, spec=spec)
+        processRawAnchors(rawAnchorData, anchors, specHeadings)
+
+    m.say("Downloading anchor data from Webref...")
+    webrefAPIUrl = "https://raw.githubusercontent.com/w3c/webref/main/"
+    currentWebrefData = dataFromWebref(webrefAPIUrl + "ed/index.json")
+    if not currentWebrefData or "results" not in currentWebrefData or currentWebrefData["results"] is None:
+        return None
+    currentWebrefData = currentWebrefData["results"]
+    snapshotWebrefData = dataFromWebref(webrefAPIUrl + "tr/index.json")
+    snapshotWebrefData = (
+        snapshotWebrefData["results"]
+        if snapshotWebrefData and "results" in snapshotWebrefData and snapshotWebrefData["results"] is not None
+        else None
+    )
+
+    for i, currentSpec in enumerate(currentWebrefData, 1):
+        lastMsgTime = config.doEvery(
+            s=5,
+            lastTime=lastMsgTime,
+            action=progressMessager(i, len(currentWebrefData)),
+        )
+        if "dfns" not in currentSpec and "headings" not in currentSpec:
+            continue
+        spec = genWebrefSpec(currentSpec)
+        if isSpecInList(spec, specs):
+            # Skip specs that are already in Shepherd's database
+            continue
+
+        assert spec["vshortname"] is not None
+        specs[spec["vshortname"]] = spec
+        specHeadings: HeadingsT = {}
+        headings[spec["vshortname"]] = specHeadings
+        rawAnchorData: list[RawAnchorT] = []
+        if "dfns" in currentSpec:
+            currentAnchors = dataFromWebref(webrefAPIUrl + "ed/" + currentSpec["dfns"])
+            if currentAnchors:
+                rawAnchorData = [
+                    setStatus(convertWebrefAnchor(x, spec["current_url"]), "current") for x in currentAnchors["dfns"]
+                ]
+        if "headings" in currentSpec:
+            currentHeadings = dataFromWebref(webrefAPIUrl + "ed/" + currentSpec["headings"])
+            if currentHeadings:
+                rawAnchorData += [
+                    setStatus(convertWebrefHeading(x, spec["current_url"]), "current")
+                    for x in currentHeadings["headings"]
+                ]
+
+        # Complete list of anchors/headings with those from the snapshot version of the spec
+        if spec["snapshot_url"] is not None and snapshotWebrefData is not None:
+            snapshotSpec = [s for s in snapshotWebrefData if s["shortname"] == currentSpec["shortname"]]
+            if len(snapshotSpec) > 0:
+                snapshotSpec = snapshotSpec.pop()
+                if "dfns" in snapshotSpec:
+                    snapshotAnchors = dataFromWebref(webrefAPIUrl + "tr/" + snapshotSpec["dfns"])
+                    if snapshotAnchors:
+                        rawAnchorData += [
+                            setStatus(convertWebrefAnchor(x, spec["snapshot_url"]), "snapshot")
+                            for x in snapshotAnchors["dfns"]
+                        ]
+                if "headings" in snapshotSpec:
+                    snapshotHeadings = dataFromWebref(webrefAPIUrl + "tr/" + snapshotSpec["headings"])
+                    if snapshotHeadings:
+                        rawAnchorData += [
+                            setStatus(convertWebrefHeading(x, spec["snapshot_url"]), "snapshot")
+                            for x in snapshotHeadings["headings"]
+                        ]
+
+        if len(rawAnchorData) > 0:
+            processRawAnchors(rawAnchorData, anchors, specHeadings)
 
     cleanSpecHeadings(headings)
 
@@ -214,6 +333,21 @@ def dataFromApi(api: APIClient, *args: t.Any, **kwargs: t.Any) -> t.JSONT:
     return t.cast("t.JSONT", data)
 
 
+@tenacity.retry(reraise=True, stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_random(1, 2))
+def dataFromWebref(url: str) -> t.JSONT:
+    try:
+        response = requests.get(url)
+    except Exception as e:
+        raise Exception(f"Couldn't download data from Webref.\n{e}") from e
+    try:
+        data = response.json()
+    except Exception as e:
+        raise Exception(
+            f"Data retrieved from Webref wasn't valid JSON for some reason. Try downloading again?\n{e}"
+        ) from e
+    return data
+
+
 def linearizeAnchorTree(multiTree: list, rawAnchors: list[dict[str, t.Any]] | None = None) -> list[RawAnchorT]:
     if rawAnchors is None:
         rawAnchors = []
@@ -264,6 +398,70 @@ def genSpec(rawSpec: RawSpecT) -> SpecT:
     return spec
 
 
+def genWebrefSpec(rawSpec: RawWebrefSpecT) -> SpecT:
+    """Generate a spec object from data gleaned from Webref"""
+    assert rawSpec["shortname"] is not None
+    assert rawSpec["series"] is not None
+    assert rawSpec["nightly"] is not None
+    assert rawSpec["title"] is not None
+    assert rawSpec["shortTitle"] is not None
+    spec: SpecT = {
+        "vshortname": rawSpec["shortname"],
+        "shortname": rawSpec["series"]["shortname"],
+        "snapshot_url": rawSpec["release"]["url"] if "release" in rawSpec else None,
+        "current_url": rawSpec["nightly"]["url"],
+        "title": rawSpec["shortTitle"],
+        "description": rawSpec["title"],
+        "abstract": None,
+        "level": int(rawSpec["seriesVersion"])
+        if "seriesVersion" in rawSpec and re.match(r"^\d+$", rawSpec["seriesVersion"])
+        else None,
+    }
+    return spec
+
+
+def convertWebrefHeading(heading: RawWebrefHeadingT, specUrl: str) -> RawAnchorT:
+    """Convert a heading returned by Webref to the anchor format used in Shepherd"""
+    assert heading["id"] is not None
+    assert heading["title"] is not None
+    assert heading["href"] is not None
+    anchor: RawAnchorT = {
+        "name": heading["number"] if "number" in heading else heading["id"],
+        "type": "heading",
+        "for": [],
+        "section": True,
+        "title": heading["title"],
+        "status": "unknown",
+        "normative": True,
+        "export": "public",
+        "linking_text": [heading["title"]],
+        "uri": heading["href"].replace(specUrl, ""),
+    }
+    return anchor
+
+
+def convertWebrefAnchor(rawAnchor: RawWebrefAnchorT, specUrl: str) -> RawAnchorT:
+    """Convert an anchor returned by Webref to the anchor format used in Shepherd"""
+    assert rawAnchor["id"] is not None
+    assert rawAnchor["type"] is not None
+    assert rawAnchor["linkingText"] is not None
+    assert rawAnchor["access"] is not None
+    assert rawAnchor["href"] is not None
+    anchor: RawAnchorT = {
+        "name": rawAnchor["id"],
+        "type": rawAnchor["type"],
+        "for": rawAnchor["for"],
+        "section": False,
+        "title": rawAnchor["linkingText"][0],
+        "status": "unknown",
+        "normative": not rawAnchor["informative"],
+        "export": rawAnchor["access"] == "public",
+        "linking_text": rawAnchor["linkingText"],
+        "uri": rawAnchor["href"].replace(specUrl, ""),
+    }
+    return anchor
+
+
 def fixupAnchor(anchor: RawAnchorT) -> RawAnchorT:
     """Miscellaneous fixes to the anchors before I start processing"""