Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stop instantiating character ranges #99

Merged
merged 53 commits into from
Aug 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
cbf883c
Stop allowing missing states in the map
qntm Jul 31, 2023
b6c284d
No more oblivion states
qntm Jul 31, 2023
e9785dd
Eliminate some complex logic for handling missing transitions
qntm Jul 31, 2023
f42949f
Introduce a function for character class repartitioning
qntm Jul 31, 2023
b17b948
A few more
qntm Jul 31, 2023
b453496
Merges?
qntm Aug 3, 2023
b210e8e
Merges.
qntm Aug 3, 2023
d03ca08
Always require ANYTHING_ELSE in the alphabet
qntm Aug 3, 2023
f4c0cc8
Eliminate logic accounting for ANYTHING_ELSE being missing
qntm Aug 3, 2023
5f0e1c0
Disallow a None state
qntm Aug 3, 2023
c16995f
Flip some logic
qntm Aug 3, 2023
cc96197
Allow an Fsm's alphabet to be replaced
qntm Aug 3, 2023
538d8da
Split out ANYTHING_ELSE, move Charclass Fsm constructor
qntm Aug 3, 2023
a29abbb
Move tests over
qntm Aug 3, 2023
8350680
Fsms now use Charclasses internally instead of single characters
qntm Aug 5, 2023
03d2e79
Introducing combine_alphabets, the worst code ever
qntm Aug 5, 2023
b4a7d6d
This is actually working, countdown to hitting a brick wall
qntm Aug 5, 2023
627bdbd
This is mad but we no longer use ANYTHING_ELSE inside Fsm
qntm Aug 5, 2023
fa8a976
Stop allowing ANYTHING_ELSE in Fsm constructions
qntm Aug 5, 2023
68d6406
ANYTHING_ELSE is dead
qntm Aug 5, 2023
77c9e0e
Some simplifications
qntm Aug 5, 2023
fcf2a91
All symbols must now be Charclasses
qntm Aug 5, 2023
146c127
Some simplifications
qntm Aug 5, 2023
564a6e3
Rename get_chars back to alphabet
qntm Aug 5, 2023
269a587
Fix Charclass sorting
qntm Aug 5, 2023
87cba49
Only allow integer states
qntm Aug 5, 2023
1df2cf5
Formatting
qntm Aug 5, 2023
2a9babf
Fix all type errors
qntm Aug 5, 2023
1459cc0
Placate the linter
qntm Aug 5, 2023
2de274f
Final
qntm Aug 5, 2023
618b8fb
Stop requiring an alphabet for null and epsilon
qntm Aug 5, 2023
fc1203b
Make those constants
qntm Aug 5, 2023
3021065
Apply some simplifications
qntm Aug 5, 2023
33712e8
Allow larger Charclasses and fix tests
qntm Aug 5, 2023
ec18773
Guess we don't need this!
qntm Aug 5, 2023
c5b901b
Charclasses now use single-character ranges internally
qntm Aug 6, 2023
0c6e513
Alter the Charclass constructor to require ranges
qntm Aug 6, 2023
c6fef8a
Private API for Charclass
qntm Aug 6, 2023
3e68d9c
Halfway there
qntm Aug 6, 2023
2d79228
2/3rds of the way there
qntm Aug 6, 2023
8004f2f
Nightmare code
qntm Aug 6, 2023
b6598db
THAT WAS HAAARD
qntm Aug 6, 2023
07d5001
It works.
qntm Aug 6, 2023
e3cc855
Simplify tests again
qntm Aug 6, 2023
466b5a6
Well it's working...
qntm Aug 6, 2023
566c5ee
Fix some performance issues
qntm Aug 6, 2023
09222f9
A final performance thing?
qntm Aug 6, 2023
2d4274b
Penultimateness
qntm Aug 6, 2023
b2b8e6c
Lint
qntm Aug 6, 2023
3a72904
Lint 2
qntm Aug 6, 2023
7ff30be
Lint 3
qntm Aug 6, 2023
a2ffff8
Lint 4 (?)
qntm Aug 6, 2023
ef14e4e
Lint 5
qntm Aug 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
334 changes: 223 additions & 111 deletions greenery/charclass.py

Large diffs are not rendered by default.

226 changes: 184 additions & 42 deletions greenery/charclass_test.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import annotations

import pytest

from .charclass import (
DIGIT,
DOT,
Expand All @@ -12,8 +10,49 @@
SPACECHAR,
WORDCHAR,
Charclass,
add_ord_range,
repartition,
)
from .fsm import ANYTHING_ELSE


def test_add_ord_range_0() -> None:
    """Adding a range to an empty list yields a list holding only that range."""
    merged = add_ord_range([], (1, 2))
    assert merged == [(1, 2)]


def test_add_ord_range_1a() -> None:
    """A range touching none of the existing ranges is slotted in, in order."""
    existing = [(1, 1), (3, 4), (10, 11), (13, 17)]
    merged = add_ord_range(existing, (7, 7))
    assert merged == [(1, 1), (3, 4), (7, 7), (10, 11), (13, 17)]


def test_add_ord_range_1b() -> None:
    """Add assorted ranges to a list holding the single range (5, 16).

    Each case pairs the range being added with the expected result. The
    cases cover ranges that stay separate, ranges that overlap, and ranges
    that are merely adjacent (e.g. (1, 4) or (17, 20)), which are expected
    to merge with (5, 16).
    """
    cases = [
        ((1, 1), [(1, 1), (5, 16)]),
        ((1, 2), [(1, 2), (5, 16)]),
        ((1, 3), [(1, 3), (5, 16)]),
        ((1, 4), [(1, 16)]),
        ((1, 5), [(1, 16)]),
        ((1, 16), [(1, 16)]),
        ((1, 17), [(1, 17)]),
        ((1, 18), [(1, 18)]),
        ((4, 4), [(4, 16)]),
        ((5, 5), [(5, 16)]),
        ((5, 18), [(5, 18)]),
        ((7, 8), [(5, 16)]),
        ((10, 20), [(5, 20)]),
        ((16, 20), [(5, 20)]),
        ((17, 20), [(5, 20)]),
        ((18, 20), [(5, 16), (18, 20)]),
    ]
    for new_range, expected in cases:
        # A fresh starting list per case, exactly as in a one-assert-per-line form.
        assert add_ord_range([(5, 16)], new_range) == expected


def test_add_ord_range_2() -> None:
    """Add assorted ranges to a list holding two ranges, (1, 2) and (11, 12).

    Each case pairs the range being added with the expected result: it may
    stay separate, merge with one neighbour, or swallow both.
    """
    cases = [
        ((5, 6), [(1, 2), (5, 6), (11, 12)]),
        ((3, 6), [(1, 6), (11, 12)]),
        ((2, 6), [(1, 6), (11, 12)]),
        ((5, 9), [(1, 2), (5, 9), (11, 12)]),
        ((5, 10), [(1, 2), (5, 12)]),
        ((-2, -1), [(-2, -1), (1, 2), (11, 12)]),
        ((0, 20), [(0, 20)]),
    ]
    for new_range, expected in cases:
        # A fresh starting list per case so no call can affect the next.
        assert add_ord_range([(1, 2), (11, 12)], new_range) == expected


def test_charclass_equality() -> None:
Expand All @@ -24,21 +63,22 @@ def test_charclass_equality() -> None:


# Exercises the Charclass constructor: a frozenset containing ANYTHING_ELSE
# is expected to raise TypeError, a frozenset containing the two-character
# string "aa" is expected to raise ValueError, a string argument compares
# equal to the frozenset of its characters, and `negated` defaults to False.
# NOTE(review): indentation and diff +/- markers are lost in this view; the
# two `pytest.raises` checks presumably belong to the removed side of the
# diff (the `pytest` and `ANYTHING_ELSE` imports look removed too) - confirm.
def test_charclass_ctor() -> None:
with pytest.raises(TypeError):
Charclass(frozenset({"a", ANYTHING_ELSE})) # type: ignore

with pytest.raises(ValueError):
Charclass(frozenset({"a", "aa"}))

assert Charclass("ab") == Charclass(frozenset({"a", "b"}))

assert not Charclass("ab").negated
assert not Charclass("ab", negated=False).negated
assert Charclass("ab", negated=True).negated


# Checks repr() of the negated single-character class [^a].
# NOTE(review): two different expected strings are asserted for the same
# expression, so both cannot hold; presumably the first is the removed side
# of the diff and the second (range-tuple form) is the added side - confirm.
def test_repr() -> None:
assert repr(~Charclass("a")) == "~Charclass('a')"
assert repr(~Charclass("a")) == "~Charclass((('a', 'a'),))"


def test_issubset() -> None:
    """Check Charclass.issubset for plain, ranged and negated classes."""
    assert Charclass("a").issubset(Charclass("a"))
    assert not Charclass("a").issubset(Charclass("b"))
    assert Charclass("a").issubset(Charclass((("a", "b"),)))
    assert Charclass("a").issubset(~Charclass("b"))
    assert not (~Charclass("a")).issubset(Charclass("b"))
    # Parenthesise the negation: `~Charclass("a").issubset(DOT)` would apply
    # `~` to the boolean result of issubset(), giving -1 or -2, which are both
    # truthy, so the assertion could never fail.
    assert (~Charclass("a")).issubset(DOT)


def test_charclass_str() -> None:
Expand Down Expand Up @@ -84,15 +124,6 @@ def test_charclass_str() -> None:
assert str(~Charclass("^")) == "[^\\^]"


# Converts the negated class [^a] to an FSM via to_fsm() and checks that its
# alphabet is {"a", ANYTHING_ELSE} and that it accepts "b" (given as a string
# and as a one-element list) as well as the ANYTHING_ELSE symbol.
# NOTE(review): this whole test appears to be on the removed side of the
# diff (ANYTHING_ELSE is being retired) - confirm before relying on it.
def test_charclass_fsm() -> None:
# "[^a]"
nota = (~Charclass("a")).to_fsm()
assert nota.alphabet == {"a", ANYTHING_ELSE}
assert nota.accepts("b")
assert nota.accepts(["b"])
assert nota.accepts([ANYTHING_ELSE])


def test_charclass_negation() -> None:
assert ~~Charclass("a") == Charclass("a")
assert Charclass("a") == ~~Charclass("a")
Expand All @@ -103,24 +134,11 @@ def test_charclass_union() -> None:
assert Charclass("ab") | Charclass("bc") == Charclass("abc")
# [ab] ∪ [^bc] = [^c]
assert Charclass("ab") | ~Charclass("bc") == ~Charclass("c")
# [^a] ∪ [bc] = [^a]
# [^ab] ∪ [bc] = [^a]
assert ~Charclass("ab") | Charclass("bc") == ~Charclass("a")
# [^ab] ∪ [^bc] = [^b]
assert ~Charclass("ab") | ~Charclass("bc") == ~Charclass("b")

assert Charclass.union() == NULLCHARCLASS

assert Charclass.union(
Charclass("ab"),
Charclass("a"),
Charclass("cd"),
) == Charclass("abcd")

assert Charclass.union(
Charclass("ab"),
~Charclass("abc"),
) == ~Charclass("c")


def test_charclass_intersection() -> None:
# [ab] ∩ [bc] = [b]
Expand All @@ -132,15 +150,139 @@ def test_charclass_intersection() -> None:
# [^ab] ∩ [^bc] = [^abc]
assert ~Charclass("ab") & ~Charclass("bc") == ~Charclass("abc")

assert Charclass.intersection(
Charclass("ab"),
Charclass("bcd"),
Charclass("abcde"),
) == Charclass("b")

assert Charclass.intersection() == ~NULLCHARCLASS
assert (Charclass("ab") & Charclass("bcd") & Charclass("abcde")) == Charclass("b")


def test_empty() -> None:
    """NULLCHARCLASS reports itself as empty; DOT does not."""
    null_is_empty = NULLCHARCLASS.empty()
    assert null_is_empty
    dot_is_empty = DOT.empty()
    assert not dot_is_empty


def test_repartition_elementary() -> None:
    """A lone class is partitioned into just itself."""
    partition = repartition([Charclass("a")])
    expected = {Charclass("a"): [Charclass("a")]}
    assert partition == expected


def test_repartition_elementary_2() -> None:
    """A class and its negation are disjoint, so each maps to itself."""
    partition = repartition([Charclass("a"), ~Charclass("a")])
    expected = {
        Charclass("a"): [Charclass("a")],
        ~Charclass("a"): [~Charclass("a")],
    }
    assert partition == expected


def test_repartition_basic() -> None:
    """Overlapping classes: [abc] is split into the shared [a] plus [bc]."""
    partition = repartition([Charclass("a"), Charclass("abc")])
    expected = {
        Charclass("a"): [Charclass("a")],
        Charclass("abc"): [Charclass("a"), Charclass("bc")],
    }
    assert partition == expected


def test_repartition_negation() -> None:
    """[ab], [a] and [^ab]: [ab] splits into [a] and [b]; [^ab] stays whole."""
    inputs = [Charclass("ab"), Charclass("a"), ~Charclass("ab")]
    partition = repartition(inputs)
    expected = {
        Charclass("ab"): [Charclass("a"), Charclass("b")],
        Charclass("a"): [Charclass("a")],
        ~Charclass("ab"): [~Charclass("ab")],
    }
    assert partition == expected


def test_repartition_negation_2() -> None:
    """Negated classes are split into a negated remainder plus positive parts."""
    # Case 1: [ab], [abc] and [^ab] share the pieces [ab], [c] and [^abc].
    first = repartition([Charclass("ab"), Charclass("abc"), ~Charclass("ab")])
    first_expected = {
        Charclass("ab"): [Charclass("ab")],
        Charclass("abc"): [Charclass("ab"), Charclass("c")],
        ~Charclass("ab"): [~Charclass("abc"), Charclass("c")],
    }
    assert first == first_expected

    # Case 2: three nested negations, [^a] containing [^ab] containing [^abc].
    second = repartition([~Charclass("a"), ~Charclass("ab"), ~Charclass("abc")])
    second_expected = {
        ~Charclass("a"): [~Charclass("abc"), Charclass("b"), Charclass("c")],
        ~Charclass("ab"): [~Charclass("abc"), Charclass("c")],
        ~Charclass("abc"): [~Charclass("abc")],
    }
    assert second == second_expected


def test_repartition_advanced() -> None:
    """Five overlapping classes reduce to the pieces [a], [bcd], [ef], [^abcdef]."""
    inputs = [
        Charclass("a"),
        Charclass("bcdef"),
        ~Charclass("abcdef"),
        Charclass("abcd"),
        ~Charclass("abcd"),
    ]
    partition = repartition(inputs)
    expected = {
        Charclass("a"): [Charclass("a")],
        Charclass("bcdef"): [Charclass("bcd"), Charclass("ef")],
        ~Charclass("abcdef"): [~Charclass("abcdef")],
        Charclass("abcd"): [Charclass("a"), Charclass("bcd")],
        ~Charclass("abcd"): [~Charclass("abcdef"), Charclass("ef")],
    }
    assert partition == expected


def test_repartition_advanced_2() -> None:
    """Repartition the predefined classes, including the empty class."""
    partition = repartition([WORDCHAR, DIGIT, DOT, NONDIGITCHAR, NULLCHARCLASS])

    # The pieces expected to appear in the partition, besides DIGIT itself.
    outside_0_to_z = ~Charclass((("0", "z"),))
    punctuation = Charclass(((":", "@"), ("[", "^"), ("`", "`")))
    letters_and_underscore = Charclass(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"
    )

    expected = {
        WORDCHAR: [DIGIT, letters_and_underscore],
        DIGIT: [DIGIT],
        DOT: [outside_0_to_z, DIGIT, punctuation, letters_and_underscore],
        NONDIGITCHAR: [outside_0_to_z, punctuation, letters_and_underscore],
        NULLCHARCLASS: [
            # Yup, there's nothing here!
            # This should be impossible or at least cause no problems in practice
        ],
    }
    assert partition == expected
Loading