diff --git a/bikeshed/update/updateCrossRefs.py b/bikeshed/update/updateCrossRefs.py index 034068fae6..ba9052a191 100644 --- a/bikeshed/update/updateCrossRefs.py +++ b/bikeshed/update/updateCrossRefs.py @@ -7,6 +7,7 @@ import certifi import tenacity +import requests from json_home_client import Client as APIClient from .. import config, messages as m, t @@ -45,6 +46,32 @@ class HeadingT(t.TypedDict): total=False, ) + RawWebrefAnchorT = t.TypedDict( + "RawWebrefAnchorT", + { + "id": t.Required[str], + "type": t.Required[str], + "for": list[str], + "informative": bool, + "access": t.Required[str], + "linkingText": list[str], + "href": t.Required[str], + }, + total=False, + ) + + RawWebrefHeadingT = t.TypedDict( + "RawWebrefHeadingT", + { + "id": t.Required[str], + "href": t.Required[str], + "title": t.Required[str], + "level": t.Required[int], + "number": t.Required[str], + }, + total=False, + ) + # Need to use function form due to "for" key # being invalid as a property name AnchorT = t.TypedDict( @@ -72,6 +99,16 @@ class RawSpecT(t.TypedDict, total=False): draft_uri: str | None abstract: str | None + class RawWebrefSpecT(t.TypedDict, total=False): + url: t.Required[str] + shortname: t.Required[str] + title: t.Required[str] + shortTitle: t.Required[str] + series: t.Required(dict[str, str]) + nightly: t.Required(dict[str, str | list(str)]) + release: dict[str, str | list(str)] | None + seriesVersion: str | None + class SpecT(t.TypedDict): vshortname: str shortname: str @@ -88,7 +125,38 @@ def progressMessager(index: int, total: int) -> t.Callable[[], None]: def update(path: str, dryRun: bool = False) -> set[str] | None: - m.say("Downloading anchor data...") + def setStatus(obj: RawAnchorT, status: str) -> RawAnchorT: + obj["status"] = status + return obj + + def isSpecInList(spec: SpecT, specs: SpecsT) -> bool: + if spec["vshortname"].lower() in specs: + return True + for s in specs.values(): + # Shortnames don't always match. Let's compare URLs + if (s["snapshot_url"] is not None and s["snapshot_url"] == spec["snapshot_url"]) or ( + s["current_url"] is not None and s["current_url"] == spec["current_url"] + ): + return True + return False + + def processRawAnchors(rawAnchorData: list(RawAnchorT), anchors: AnchorsT, specHeadings: AllHeadingsT) -> None: + for rawAnchor in rawAnchorData: + rawAnchor = fixupAnchor(rawAnchor) + linkingTexts = rawAnchor["linking_text"] + assert linkingTexts is not None + if len(linkingTexts) == 0: + # Happens if it had no linking text at all originally + continue + if len(linkingTexts) == 1 and linkingTexts[0].strip() == "": + # Happens if it was marked with an empty lt and Shepherd still picked it up + continue + if "section" in rawAnchor and rawAnchor["section"] is True: + addToHeadings(rawAnchor, specHeadings, spec=spec) + if rawAnchor["type"] in config.dfnTypes.union(["dfn"]): + addToAnchors(rawAnchor, anchors, spec=spec) + + m.say("Downloading anchor data from Shepherd...") shepherd = APIClient( "https://api.csswg.org/shepherd/", version="vnd.csswg.shepherd.v1", @@ -115,27 +183,78 @@ def update(path: str, dryRun: bool = False) -> set[str] | None: specHeadings: HeadingsT = {} headings[spec["vshortname"]] = specHeadings - def setStatus(obj: RawAnchorT, status: str) -> RawAnchorT: - obj["status"] = status - return obj - rawAnchorData = [setStatus(x, "snapshot") for x in linearizeAnchorTree(rawSpec.get("anchors", []))] + [ setStatus(x, "current") for x in linearizeAnchorTree(rawSpec.get("draft_anchors", [])) ] - for rawAnchor in rawAnchorData: - rawAnchor = fixupAnchor(rawAnchor) - linkingTexts = rawAnchor["linking_text"] - assert linkingTexts is not None - if len(linkingTexts) == 0: - # Happens if it had no linking text at all originally - continue - if len(linkingTexts) == 1 and linkingTexts[0].strip() == "": - # Happens if it was marked with an empty lt and Shepherd still picked it up - continue - if "section" in rawAnchor and rawAnchor["section"] is True: - addToHeadings(rawAnchor, specHeadings, spec=spec) - if rawAnchor["type"] in config.dfnTypes.union(["dfn"]): - addToAnchors(rawAnchor, anchors, spec=spec) + processRawAnchors(rawAnchorData, anchors, specHeadings) + + m.say("Downloading anchor data from Webref...") + webrefAPIUrl = "https://raw.githubusercontent.com/w3c/webref/main/" + currentWebrefData = dataFromWebref(webrefAPIUrl + "ed/index.json") + if not currentWebrefData or "results" not in currentWebrefData or currentWebrefData["results"] is None: + return None + currentWebrefData = currentWebrefData["results"] + snapshotWebrefData = dataFromWebref(webrefAPIUrl + "tr/index.json") + snapshotWebrefData = ( + snapshotWebrefData["results"] + if snapshotWebrefData and "results" in snapshotWebrefData and snapshotWebrefData["results"] is not None + else None + ) + + for i, currentSpec in enumerate(currentWebrefData, 1): + lastMsgTime = config.doEvery( + s=5, + lastTime=lastMsgTime, + action=progressMessager(i, len(currentWebrefData)), + ) + if "dfns" not in currentSpec and "headings" not in currentSpec: + continue + spec = genWebrefSpec(currentSpec) + if isSpecInList(spec, specs): + # Skip specs that are already in Shepherd's database + continue + + assert spec["vshortname"] is not None + specs[spec["vshortname"]] = spec + specHeadings: HeadingsT = {} + headings[spec["vshortname"]] = specHeadings + rawAnchorData: list[RawAnchorT] = [] + if "dfns" in currentSpec: + currentAnchors = dataFromWebref(webrefAPIUrl + "ed/" + currentSpec["dfns"]) + if currentAnchors: + rawAnchorData = [ + setStatus(convertWebrefAnchor(x, spec["current_url"]), "current") for x in currentAnchors["dfns"] + ] + if "headings" in currentSpec: + currentHeadings = dataFromWebref(webrefAPIUrl + "ed/" + currentSpec["headings"]) + if currentHeadings: + rawAnchorData += [ + setStatus(convertWebrefHeading(x, spec["current_url"]), "current") + for x in currentHeadings["headings"] + ] + + # Complete list of anchors/headings with those from the snapshot version of the spec + if spec["snapshot_url"] is not None and snapshotWebrefData is not None: + snapshotSpec = [s for s in snapshotWebrefData if s["shortname"] == currentSpec["shortname"]] + if len(snapshotSpec) > 0: + snapshotSpec = snapshotSpec.pop() + if "dfns" in snapshotSpec: + snapshotAnchors = dataFromWebref(webrefAPIUrl + "tr/" + snapshotSpec["dfns"]) + if snapshotAnchors: + rawAnchorData += [ + setStatus(convertWebrefAnchor(x, spec["snapshot_url"]), "snapshot") + for x in snapshotAnchors["dfns"] + ] + if "headings" in snapshotSpec: + snapshotHeadings = dataFromWebref(webrefAPIUrl + "tr/" + snapshotSpec["headings"]) + if snapshotHeadings: + rawAnchorData += [ + setStatus(convertWebrefHeading(x, spec["snapshot_url"]), "snapshot") + for x in snapshotHeadings["headings"] + ] + + if len(rawAnchorData) > 0: + processRawAnchors(rawAnchorData, anchors, specHeadings) cleanSpecHeadings(headings) @@ -214,6 +333,21 @@ def dataFromApi(api: APIClient, *args: t.Any, **kwargs: t.Any) -> t.JSONT: return t.cast("t.JSONT", data) +@tenacity.retry(reraise=True, stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_random(1, 2)) +def dataFromWebref(url: str) -> t.JSONT: + try: + response = requests.get(url) + except Exception as e: + raise Exception(f"Couldn't download data from Webref.\n{e}") from e + try: + data = response.json() + except Exception as e: + raise Exception( + f"Data retrieved from Webref wasn't valid JSON for some reason. Try downloading again?\n{e}" + ) from e + return data + + def linearizeAnchorTree(multiTree: list, rawAnchors: list[dict[str, t.Any]] | None = None) -> list[RawAnchorT]: if rawAnchors is None: rawAnchors = [] @@ -264,6 +398,70 @@ def genSpec(rawSpec: RawSpecT) -> SpecT: return spec +def genWebrefSpec(rawSpec: RawWebrefSpecT) -> SpecT: + """Generate a spec object from data gleaned from Webref""" + assert rawSpec["shortname"] is not None + assert rawSpec["series"] is not None + assert rawSpec["nightly"] is not None + assert rawSpec["title"] is not None + assert rawSpec["shortTitle"] is not None + spec: SpecT = { + "vshortname": rawSpec["shortname"], + "shortname": rawSpec["series"]["shortname"], + "snapshot_url": rawSpec["release"]["url"] if "release" in rawSpec else None, + "current_url": rawSpec["nightly"]["url"], + "title": rawSpec["shortTitle"], + "description": rawSpec["title"], + "abstract": None, + "level": int(rawSpec["seriesVersion"]) + if "seriesVersion" in rawSpec and re.match(r"^\d+$", rawSpec["seriesVersion"]) + else None, + } + return spec + + +def convertWebrefHeading(heading: RawWebrefHeadingT, specUrl: str) -> RawAnchorT: + """Convert a heading returned by Webref to the anchor format used in Shepherd""" + assert heading["id"] is not None + assert heading["title"] is not None + assert heading["href"] is not None + anchor: RawAnchorT = { + "name": heading["number"] if "number" in heading else heading["id"], + "type": "heading", + "for": [], + "section": True, + "title": heading["title"], + "status": "unknown", + "normative": True, + "export": "public", + "linking_text": [heading["title"]], + "uri": heading["href"].replace(specUrl, ""), + } + return anchor + + +def convertWebrefAnchor(rawAnchor: RawWebrefAnchorT, specUrl: str) -> RawAnchorT: + """Convert an anchor returned by Webref to the anchor format used in Shepherd""" + assert rawAnchor["id"] is not None + assert rawAnchor["type"] is not None + assert rawAnchor["linkingText"] is not None + assert rawAnchor["access"] is not None + assert rawAnchor["href"] is not None + anchor: RawAnchorT = { + "name": rawAnchor["id"], + "type": rawAnchor["type"], + "for": rawAnchor["for"], + "section": False, + "title": rawAnchor["linkingText"][0], + "status": "unknown", + "normative": not rawAnchor["informative"], + "export": rawAnchor["access"] == "public", + "linking_text": rawAnchor["linkingText"], + "uri": rawAnchor["href"].replace(specUrl, ""), + } + return anchor + + def fixupAnchor(anchor: RawAnchorT) -> RawAnchorT: """Miscellaneous fixes to the anchors before I start processing"""