Skip to content

Commit

Permalink
Complete anchors/headings with data from Webref (#2426)
Browse files Browse the repository at this point in the history
  • Loading branch information
tidoust authored Dec 19, 2022
1 parent 6bd0f75 commit eacff10
Showing 1 changed file with 217 additions and 19 deletions.
236 changes: 217 additions & 19 deletions bikeshed/update/updateCrossRefs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import certifi
import tenacity
import requests
from json_home_client import Client as APIClient

from .. import config, messages as m, t
Expand Down Expand Up @@ -45,6 +46,32 @@ class HeadingT(t.TypedDict):
total=False,
)

RawWebrefAnchorT = t.TypedDict(
"RawWebrefAnchorT",
{
"id": t.Required[str],
"type": t.Required[str],
"for": list[str],
"informative": bool,
"access": t.Required[str],
"linkingText": list[str],
"href": t.Required[str],
},
total=False,
)

RawWebrefHeadingT = t.TypedDict(
"RawWebrefHeadingT",
{
"id": t.Required[str],
"href": t.Required[str],
"title": t.Required[str],
"level": t.Required[int],
"number": t.Required[str],
},
total=False,
)

# Need to use function form due to "for" key
# being invalid as a property name
AnchorT = t.TypedDict(
Expand Down Expand Up @@ -72,6 +99,16 @@ class RawSpecT(t.TypedDict, total=False):
draft_uri: str | None
abstract: str | None

class RawWebrefSpecT(t.TypedDict, total=False):
url: t.Required[str]
shortname: t.Required[str]
title: t.Required[str]
shortTitle: t.Required[str]
series: t.Required(dict[str, str])
nightly: t.Required(dict[str, str | list(str)])
release: dict[str, str | list(str)] | None
seriesVersion: str | None

class SpecT(t.TypedDict):
vshortname: str
shortname: str
Expand All @@ -88,7 +125,38 @@ def progressMessager(index: int, total: int) -> t.Callable[[], None]:


def update(path: str, dryRun: bool = False) -> set[str] | None:
m.say("Downloading anchor data...")
def setStatus(obj: RawAnchorT, status: str) -> RawAnchorT:
obj["status"] = status
return obj

def isSpecInList(spec: SpecT, specs: SpecsT) -> bool:
if spec["vshortname"].lower() in specs:
return True
for s in specs.values():
# Shortnames don't always match. Let's compare URLs
if (s["snapshot_url"] is not None and s["snapshot_url"] == spec["snapshot_url"]) or (
s["current_url"] is not None and s["current_url"] == spec["current_url"]
):
return True
return False

def processRawAnchors(rawAnchorData: list(RawAnchorT), anchors: AnchorsT, specHeadings: AllHeadingsT) -> None:
for rawAnchor in rawAnchorData:
rawAnchor = fixupAnchor(rawAnchor)
linkingTexts = rawAnchor["linking_text"]
assert linkingTexts is not None
if len(linkingTexts) == 0:
# Happens if it had no linking text at all originally
continue
if len(linkingTexts) == 1 and linkingTexts[0].strip() == "":
# Happens if it was marked with an empty lt and Shepherd still picked it up
continue
if "section" in rawAnchor and rawAnchor["section"] is True:
addToHeadings(rawAnchor, specHeadings, spec=spec)
if rawAnchor["type"] in config.dfnTypes.union(["dfn"]):
addToAnchors(rawAnchor, anchors, spec=spec)

m.say("Downloading anchor data from Shepherd...")
shepherd = APIClient(
"https://api.csswg.org/shepherd/",
version="vnd.csswg.shepherd.v1",
Expand All @@ -115,27 +183,78 @@ def update(path: str, dryRun: bool = False) -> set[str] | None:
specHeadings: HeadingsT = {}
headings[spec["vshortname"]] = specHeadings

def setStatus(obj: RawAnchorT, status: str) -> RawAnchorT:
obj["status"] = status
return obj

rawAnchorData = [setStatus(x, "snapshot") for x in linearizeAnchorTree(rawSpec.get("anchors", []))] + [
setStatus(x, "current") for x in linearizeAnchorTree(rawSpec.get("draft_anchors", []))
]
for rawAnchor in rawAnchorData:
rawAnchor = fixupAnchor(rawAnchor)
linkingTexts = rawAnchor["linking_text"]
assert linkingTexts is not None
if len(linkingTexts) == 0:
# Happens if it had no linking text at all originally
continue
if len(linkingTexts) == 1 and linkingTexts[0].strip() == "":
# Happens if it was marked with an empty lt and Shepherd still picked it up
continue
if "section" in rawAnchor and rawAnchor["section"] is True:
addToHeadings(rawAnchor, specHeadings, spec=spec)
if rawAnchor["type"] in config.dfnTypes.union(["dfn"]):
addToAnchors(rawAnchor, anchors, spec=spec)
processRawAnchors(rawAnchorData, anchors, specHeadings)

m.say("Downloading anchor data from Webref...")
webrefAPIUrl = "https://raw.githubusercontent.com/w3c/webref/main/"
currentWebrefData = dataFromWebref(webrefAPIUrl + "ed/index.json")
if not currentWebrefData or "results" not in currentWebrefData or currentWebrefData["results"] is None:
return None
currentWebrefData = currentWebrefData["results"]
snapshotWebrefData = dataFromWebref(webrefAPIUrl + "tr/index.json")
snapshotWebrefData = (
snapshotWebrefData["results"]
if snapshotWebrefData and "results" in snapshotWebrefData and snapshotWebrefData["results"] is not None
else None
)

for i, currentSpec in enumerate(currentWebrefData, 1):
lastMsgTime = config.doEvery(
s=5,
lastTime=lastMsgTime,
action=progressMessager(i, len(currentWebrefData)),
)
if "dfns" not in currentSpec and "headings" not in currentSpec:
continue
spec = genWebrefSpec(currentSpec)
if isSpecInList(spec, specs):
# Skip specs that are already in Shepherd's database
continue

assert spec["vshortname"] is not None
specs[spec["vshortname"]] = spec
specHeadings: HeadingsT = {}
headings[spec["vshortname"]] = specHeadings
rawAnchorData: list[RawAnchorT] = []
if "dfns" in currentSpec:
currentAnchors = dataFromWebref(webrefAPIUrl + "ed/" + currentSpec["dfns"])
if currentAnchors:
rawAnchorData = [
setStatus(convertWebrefAnchor(x, spec["current_url"]), "current") for x in currentAnchors["dfns"]
]
if "headings" in currentSpec:
currentHeadings = dataFromWebref(webrefAPIUrl + "ed/" + currentSpec["headings"])
if currentHeadings:
rawAnchorData += [
setStatus(convertWebrefHeading(x, spec["current_url"]), "current")
for x in currentHeadings["headings"]
]

# Complete list of anchors/headings with those from the snapshot version of the spec
if spec["snapshot_url"] is not None and snapshotWebrefData is not None:
snapshotSpec = [s for s in snapshotWebrefData if s["shortname"] == currentSpec["shortname"]]
if len(snapshotSpec) > 0:
snapshotSpec = snapshotSpec.pop()
if "dfns" in snapshotSpec:
snapshotAnchors = dataFromWebref(webrefAPIUrl + "tr/" + snapshotSpec["dfns"])
if snapshotAnchors:
rawAnchorData += [
setStatus(convertWebrefAnchor(x, spec["snapshot_url"]), "snapshot")
for x in snapshotAnchors["dfns"]
]
if "headings" in snapshotSpec:
snapshotHeadings = dataFromWebref(webrefAPIUrl + "tr/" + snapshotSpec["headings"])
if snapshotHeadings:
rawAnchorData += [
setStatus(convertWebrefHeading(x, spec["snapshot_url"]), "snapshot")
for x in snapshotHeadings["headings"]
]

if len(rawAnchorData) > 0:
processRawAnchors(rawAnchorData, anchors, specHeadings)

cleanSpecHeadings(headings)

Expand Down Expand Up @@ -214,6 +333,21 @@ def dataFromApi(api: APIClient, *args: t.Any, **kwargs: t.Any) -> t.JSONT:
return t.cast("t.JSONT", data)


@tenacity.retry(reraise=True, stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_random(1, 2))
def dataFromWebref(url: str) -> t.JSONT:
try:
response = requests.get(url)
except Exception as e:
raise Exception(f"Couldn't download data from Webref.\n{e}") from e
try:
data = response.json()
except Exception as e:
raise Exception(
f"Data retrieved from Webref wasn't valid JSON for some reason. Try downloading again?\n{e}"
) from e
return data


def linearizeAnchorTree(multiTree: list, rawAnchors: list[dict[str, t.Any]] | None = None) -> list[RawAnchorT]:
if rawAnchors is None:
rawAnchors = []
Expand Down Expand Up @@ -264,6 +398,70 @@ def genSpec(rawSpec: RawSpecT) -> SpecT:
return spec


def genWebrefSpec(rawSpec: RawWebrefSpecT) -> SpecT:
"""Generate a spec object from data gleaned from Webref"""
assert rawSpec["shortname"] is not None
assert rawSpec["series"] is not None
assert rawSpec["nightly"] is not None
assert rawSpec["title"] is not None
assert rawSpec["shortTitle"] is not None
spec: SpecT = {
"vshortname": rawSpec["shortname"],
"shortname": rawSpec["series"]["shortname"],
"snapshot_url": rawSpec["release"]["url"] if "release" in rawSpec else None,
"current_url": rawSpec["nightly"]["url"],
"title": rawSpec["shortTitle"],
"description": rawSpec["title"],
"abstract": None,
"level": int(rawSpec["seriesVersion"])
if "seriesVersion" in rawSpec and re.match(r"^\d+$", rawSpec["seriesVersion"])
else None,
}
return spec


def convertWebrefHeading(heading: RawWebrefHeadingT, specUrl: str) -> RawAnchorT:
"""Convert a heading returned by Webref to the anchor format used in Shepherd"""
assert heading["id"] is not None
assert heading["title"] is not None
assert heading["href"] is not None
anchor: RawAnchorT = {
"name": heading["number"] if "number" in heading else heading["id"],
"type": "heading",
"for": [],
"section": True,
"title": heading["title"],
"status": "unknown",
"normative": True,
"export": "public",
"linking_text": [heading["title"]],
"uri": heading["href"].replace(specUrl, ""),
}
return anchor


def convertWebrefAnchor(rawAnchor: RawWebrefAnchorT, specUrl: str) -> RawAnchorT:
"""Convert an anchor returned by Webref to the anchor format used in Shepherd"""
assert rawAnchor["id"] is not None
assert rawAnchor["type"] is not None
assert rawAnchor["linkingText"] is not None
assert rawAnchor["access"] is not None
assert rawAnchor["href"] is not None
anchor: RawAnchorT = {
"name": rawAnchor["id"],
"type": rawAnchor["type"],
"for": rawAnchor["for"],
"section": False,
"title": rawAnchor["linkingText"][0],
"status": "unknown",
"normative": not rawAnchor["informative"],
"export": rawAnchor["access"] == "public",
"linking_text": rawAnchor["linkingText"],
"uri": rawAnchor["href"].replace(specUrl, ""),
}
return anchor


def fixupAnchor(anchor: RawAnchorT) -> RawAnchorT:
"""Miscellaneous fixes to the anchors before I start processing"""

Expand Down

0 comments on commit eacff10

Please sign in to comment.