Skip to content

Commit

Permalink
Add scan_wikitext example
Browse files Browse the repository at this point in the history
  • Loading branch information
MegaIng committed Jun 20, 2024
1 parent 884d18b commit d0d9fcc
Showing 1 changed file with 42 additions and 0 deletions.
42 changes: 42 additions & 0 deletions examples/advanced/scan_wikitext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""
Showcases how to use `Lark.scan` to select a pattern from a larger text without having to parse all of it.
Uses `requests` to fetch the current wikitext of the Wikipedia article `Python (programming language)` and uses a simple grammar
to extract all wikitext templates used in the page.
"""

from collections import Counter
from pprint import pprint

import lark
import requests

# Title of the Wikipedia article to analyse, in URL form (underscores for spaces).
page_name = "Python_(programming_language)"
# `?action=raw` requests the page's wikitext source rather than the rendered HTML.
url = f"https://en.wikipedia.org/wiki/{page_name}?action=raw"

# Bound the wait so the example cannot hang forever on a stalled connection,
# and fail loudly on an HTTP error instead of scanning an error page as if it
# were wikitext.
response = requests.get(url, timeout=30)
response.raise_for_status()
wikitext = response.text

# Lark grammar describing a single wikitext template.
#
# * `template`  - "{{", a TEXT token (the template name), zero or more
#                 "|"-separated arguments, then "}}".
# * `text`      - one or more TEXT tokens and/or nested templates.
# * `argument`  - either word characters directly followed by "=" (checked with
#                 a lookahead), the "=" itself and a `text` (aliased to
#                 `named_argument`), or just a `text` (aliased to
#                 `numbered_argument`).
# * `TEXT`      - written with the verbose (`x`) regex flag; one or more of:
#                 any character other than "{", "}" or "|"; a "{" that is not
#                 followed by another "{"; a "}" that is not followed by
#                 another "}".
grammar = r"""
template: "{{" TEXT ("|" argument)* "}}"
text: (TEXT|template)+
argument: /\w+(?==)/ "=" text -> named_argument
| text -> numbered_argument
TEXT: / (?:[^{}|]
| \{(?!\{)
| \}(?!\})
)+/x
"""
# LALR parser whose start rule is `template`.
parser = lark.Lark(grammar, parser='lalr', start='template')
# Tally every template occurrence by name, and separately count how many of
# those occurrences are nested inside another template.
used_templates = Counter()
inner_templates = 0
# `scan` yields `((start, end), tree)` pairs; the span is not needed here.
for _span, res in parser.scan(wikitext):
    # `find_data` yields the matched template `res` itself as well as every
    # template nested anywhere inside it.
    for temp in res.find_data('template'):
        # Identity is what is meant here: anything other than the root of this
        # match is a nested template. (`!=` would run a deep structural
        # comparison of the two trees on every iteration.)
        if temp is not res:
            inner_templates += 1
        # The first child is the TEXT token holding the template name. Strip
        # surrounding whitespace/newlines so e.g. "cite web" and "cite web\n"
        # are counted as the same template.
        used_templates[temp.children[0].value.strip()] += 1

pprint(used_templates)
# `sum(...values())` is equivalent to `Counter.total()`, which only exists on
# Python 3.10+.
print("Total templates used:", sum(used_templates.values()))
print("Number of templates nested inside others:", inner_templates)

0 comments on commit d0d9fcc

Please sign in to comment.