-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor into separate demos, add comparison demo
- Loading branch information
Showing
10 changed files
with
1,179 additions
and
1,122 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,207 +1,48 @@ | ||
import io | ||
import base64 | ||
import gradio as gr | ||
import iscc_core as ic | ||
import iscc_sdk as idk | ||
from PIL import Image | ||
|
||
idk.sdk_opts.image_thumbnail_size = 265 | ||
idk.sdk_opts.image_thumbnail_quality = 80 | ||
|
||
from demos.generate import demo as demo_generate | ||
from demos.compare import demo as demo_compare | ||
from demos.inspect_ import demo as demo_inspect | ||
from demos.chunker import demo as demo_chunker | ||
|
||
custom_css = """ | ||
.fixed-height img { | ||
height: 265px; /* Fixed height */ | ||
.fixed-height { | ||
height: 240px; /* Fixed height */ | ||
object-fit: contain; /* Scale the image to fit within the element */ | ||
} | ||
#chunked-text span.label { | ||
text-transform: none !important; | ||
} | ||
""" | ||
|
||
newline_symbols = { | ||
"\u000a": "⏎", # Line Feed - Represented by the 'Return' symbol | ||
"\u000b": "↨", # Vertical Tab - Represented by the 'Up Down Arrow' symbol | ||
"\u000c": "␌", # Form Feed - Unicode Control Pictures representation | ||
"\u000d": "↵", # Carriage Return - 'Downwards Arrow with Corner Leftwards' symbol | ||
"\u0085": "⤓", # Next Line - 'Downwards Arrow with Double Stroke' symbol | ||
"\u2028": "↲", # Line Separator - 'Downwards Arrow with Tip Leftwards' symbol | ||
"\u2029": "¶", # Paragraph Separator - Represented by the 'Pilcrow' symbol | ||
.json-holder { | ||
word-wrap: break-word; | ||
white-space: pre-wrap; | ||
} | ||
#examples-a, #examples-b { | ||
height: 140px; /* Fixed height */ | ||
object-fit: contain; /* Scale the image to fit within the element */ | ||
} | ||
def no_nl(text):
    """Return ``text`` with each character in ``newline_symbols`` swapped for its visible symbol."""
    # All keys of the mapping are single characters and no replacement symbol
    # is itself a key, so one translate pass matches chained replace() calls.
    return text.translate(str.maketrans(newline_symbols))
|
||
|
||
def generate_iscc(file):
    """
    Generate an ISCC for an uploaded file and prepare the values shown in the UI.

    :param file: Object exposing the path of the uploaded file as ``file.name``,
        or ``None`` when no file is selected.
    :return: Tuple ``(iscc, thumbnail, metadata)`` where ``thumbnail`` is a PIL image
        or ``None`` and ``metadata`` is a dict without a non-empty ``thumbnail`` entry.
        All three values are ``None`` when ``file`` is ``None``.
    """
    if file is None:
        # Nothing to process (e.g. the file input was cleared): reset all outputs
        # instead of failing on ``file.name``.
        return None, None, None

    imeta = idk.code_iscc(file.name)

    thumbnail = None
    if imeta.thumbnail:
        # Presumably a data URL ("<header>,<base64 payload>"): only the part after
        # the first comma is decoded into image bytes.
        _, encoded = imeta.thumbnail.split(",", 1)
        thumbnail = Image.open(io.BytesIO(base64.b64decode(encoded)))

    metadata = imeta.dict(exclude_unset=False, by_alias=True)
    # The thumbnail is returned separately as an image, so drop it from the metadata.
    if metadata.get("thumbnail"):
        del metadata["thumbnail"]
    return imeta.iscc, thumbnail, metadata
|
||
|
||
def explain_iscc(code):
    """
    Build the different textual representations of an ISCC shown in the UI.

    :param code: ISCC string as entered by the user.
    :return: Tuple ``(canonical, human, decomposed, multiformat)``.
    """
    canonical = ic.iscc_normalize(code)
    # Surround every dash of the explanation with spaces for readability
    human = ic.iscc_explain(code).replace("-", " - ")
    code_obj = ic.Code(canonical)
    units = ic.iscc_decompose(canonical)
    return canonical, human, " - ".join(units), code_obj.mf_base58btc
|
||
|
||
def chunk_text(text, chunk_size):
    """
    Split text into chunks and pair every chunk with its size and feature.

    The SDK option ``text_avg_chunk_size`` is overridden only for the duration
    of the processing and is always restored, even when processing raises.

    :param text: Text to be chunked.
    :param chunk_size: Value applied to ``idk.sdk_opts.text_avg_chunk_size`` while chunking.
    :return: List of ``(chunk, "<size>:<feature>")`` tuples in text order; newline
        characters inside chunks are replaced via ``no_nl``.
    """
    original_chunk_size = idk.sdk_opts.text_avg_chunk_size
    idk.sdk_opts.text_avg_chunk_size = chunk_size
    try:
        cleaned = ic.text_clean(text)
        processed = idk.text_features(cleaned)
    finally:
        # Restore the shared option so a failure cannot leak the override
        # into later calls.
        idk.sdk_opts.text_avg_chunk_size = original_chunk_size

    result = []
    start = 0
    # Sizes are consecutive chunk lengths: walk the cleaned text slice by slice.
    for size, feat in zip(processed["sizes"], processed["features"]):
        end = start + size
        result.append((no_nl(cleaned[start:end]), f"{size}:{feat}"))
        start = end
    return result
|
||
|
||
#################################################################################################### | ||
# TAB ISCC-CODE # | ||
#################################################################################################### | ||
|
||
with gr.Blocks() as demo_generate: | ||
gr.Markdown( | ||
""" | ||
## 🌟 ISCC-CODE Generator - The DNA of Digital Content | ||
""" | ||
) | ||
with gr.Row(): | ||
with gr.Column(scale=2): | ||
in_file = gr.File(label="Media File") | ||
with gr.Column(scale=1): | ||
out_thumbnail = gr.Image( | ||
label="Extracted Thumbnail", elem_classes=["fixed-height"] | ||
) | ||
with gr.Row(): | ||
out_iscc = gr.Text(label="ISCC-CODE", show_copy_button=True) | ||
with gr.Row(): | ||
out_meta = gr.Json(label="Metadata") | ||
in_file.change( | ||
generate_iscc, inputs=[in_file], outputs=[out_iscc, out_thumbnail, out_meta] | ||
) | ||
|
||
#################################################################################################### | ||
# TAB ENCODING # | ||
#################################################################################################### | ||
|
||
with gr.Blocks() as demo_decode: | ||
gr.Markdown( | ||
""" | ||
## 🌟 A Codec for Self-Describing Compact Binary Codes | ||
""" | ||
) | ||
with gr.Row(): | ||
with gr.Column(): | ||
in_iscc = gr.Text( | ||
label="ISCC", | ||
info="INPUT ANY VALID ISCC-CODE OR ISCC-UNIT", | ||
autofocus=True, | ||
) | ||
examples = [ | ||
"ISCC:AAAWN77F727NXSUS", # Meta-Code | ||
"bzqaqaal5rvp72lx2thvq", # Multiformat | ||
"ISCC:EAASKDNZNYGUUF5A", # Text-Code | ||
"ISCC:GABW5LUBVP23N3DOD7PPINHT5JKBI", # Data-Code 128 bits | ||
"ISCC:KUAG5LUBVP23N3DOHCHWIYGXVN7ZS", # ISCC-SUM | ||
"ISCC:KAA2Y5NUST7BFD5NN2XIDK7VW3WG4OEPMRQNPK37TE", # ISCC-CDI | ||
"z36hVxiqoF8AAmDpZV958hn3tsv2i7v1NfCrSzpq", # ISCC-CDI multiformats | ||
"ISCC:KACT4EBWK27737D2AYCJRAL5Z36G76RFRMO4554RU26HZ4ORJGIVHDI", | ||
] | ||
gr.Examples(label="Example ISCCs", examples=examples, inputs=[in_iscc]) | ||
|
||
gr.Markdown("## Different Encodings:") | ||
with gr.Row(): | ||
with gr.Column(): | ||
out_canonical = gr.Text( | ||
label="Canonical", | ||
info="NORMALIZED STANDARD REPRESENTATION", | ||
show_copy_button=True, | ||
) | ||
out_human = gr.Text( | ||
label="Human Readable", | ||
info="MAINTYPE - SUBTYPE - VERSION - LENGTH - BODY", | ||
show_copy_button=True, | ||
) | ||
out_decomposed = gr.Text( | ||
label="Decomposed", | ||
info="ISCC-UNITS", | ||
show_copy_button=True, | ||
) | ||
out_multiformat = gr.Text( | ||
label="Multiformat", | ||
info="BASE58-BTC", | ||
show_copy_button=True, | ||
) | ||
in_iscc.change( | ||
explain_iscc, | ||
inputs=[in_iscc], | ||
outputs=[ | ||
out_canonical, | ||
out_human, | ||
out_decomposed, | ||
out_multiformat, | ||
], | ||
) | ||
textarea { | ||
font-family: JetBrains Mono; | ||
} | ||
""" | ||
|
||
#################################################################################################### | ||
# CHUNKING # | ||
#################################################################################################### | ||
|
||
with gr.Blocks() as demo_chunking: | ||
gr.Markdown( | ||
""" | ||
## 🌟 Content Defined Chunking for Shift-Resistant Text and Data Segmentation | ||
""" | ||
) | ||
with gr.Row(): | ||
with gr.Column(): | ||
in_text = gr.Textbox(label="Text Input", lines=8, autofocus=True) | ||
in_chunksize = gr.Slider( | ||
label="Chunk Size", | ||
info="AVERAGE NUMBER OF CHARACTERS PER CHUNK", | ||
minimum=32, | ||
maximum=2048, | ||
step=32, | ||
value=64, | ||
) | ||
iscc_theme = gr.themes.Default( | ||
font=gr.themes.GoogleFont("Readex Pro"), | ||
font_mono=gr.themes.GoogleFont("JetBrains Mono"), | ||
radius_size=gr.themes.sizes.radius_none, | ||
) | ||
|
||
out_text = gr.HighlightedText( | ||
label="Chunked Text Output", | ||
interactive=False, | ||
elem_id="chunked-text", | ||
) | ||
in_text.change(chunk_text, inputs=[in_text, in_chunksize], outputs=[out_text]) | ||
in_chunksize.change(chunk_text, inputs=[in_text, in_chunksize], outputs=[out_text]) | ||
|
||
demo = gr.TabbedInterface( | ||
title="▶️ ISCC Playground", | ||
interface_list=[demo_generate, demo_decode, demo_chunking], | ||
tab_names=["ISCC-CODE", "ENCODING", "CHUNKING"], | ||
title="▶️ ISCC Playground - The DNA of your digital content", | ||
interface_list=[demo_generate, demo_compare, demo_inspect, demo_chunker], | ||
tab_names=["GENERATE", "COMPARE", "INSPECT", "CHUNKER"], | ||
css=custom_css, | ||
theme=iscc_theme, | ||
) | ||
|
||
|
||
if __name__ == "__main__": | ||
demo.launch() |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
import gradio as gr | ||
import iscc_core as ic | ||
import iscc_sdk as idk | ||
import pathlib | ||
|
||
|
||
# Directory containing this module; the sample file is resolved relative to it.
HERE = pathlib.Path(__file__).parent.absolute()
SAMPLE_FILEPATH = HERE / "samples/sample.txt"
# read_text() opens and closes the file itself, so no file handle is left open.
sample_text = SAMPLE_FILEPATH.read_text(encoding="utf-8")
|
||
# Maps line-breaking characters to printable stand-in symbols.
newline_symbols = {
    "\u000a": "⏎",  # Line Feed -> 'Return Symbol'
    "\u000b": "↨",  # Vertical Tab -> 'Up Down Arrow with Base'
    "\u000c": "␌",  # Form Feed -> 'Symbol for Form Feed' (Control Pictures)
    "\u000d": "↵",  # Carriage Return -> 'Downwards Arrow with Corner Leftwards'
    "\u0085": "⤓",  # Next Line -> 'Downwards Arrow to Bar'
    "\u2028": "↲",  # Line Separator -> 'Downwards Arrow with Tip Leftwards'
    "\u2029": "¶",  # Paragraph Separator -> 'Pilcrow Sign'
}

# Keep label text inside the element with id "chunked-text" in its original case.
custom_css = """
#chunked-text span.label {
text-transform: none !important;
}
"""


def no_nl(text):
    """Return ``text`` with each character in ``newline_symbols`` swapped for its printable symbol."""
    # All keys are single characters and no symbol is itself a key, so a single
    # translate pass gives the same result as replacing entry by entry.
    return text.translate(str.maketrans(newline_symbols))
|
||
|
||
def chunk_text(text, chunk_size):
    """
    Split text into chunks and pair every chunk with its size and feature.

    The SDK option ``text_avg_chunk_size`` is overridden only for the duration
    of the processing and is always restored, even when processing raises.

    :param text: Text to be chunked.
    :param chunk_size: Value applied to ``idk.sdk_opts.text_avg_chunk_size`` while chunking.
    :return: List of ``(chunk, "<size>:<feature>")`` tuples in text order; newline
        characters inside chunks are replaced via ``no_nl``.
    """
    original_chunk_size = idk.sdk_opts.text_avg_chunk_size
    idk.sdk_opts.text_avg_chunk_size = chunk_size
    try:
        cleaned = ic.text_clean(text)
        processed = idk.text_features(cleaned)
    finally:
        # Restore the shared option so a failure cannot leak the override
        # into later calls.
        idk.sdk_opts.text_avg_chunk_size = original_chunk_size

    result = []
    start = 0
    # Sizes are consecutive chunk lengths: walk the cleaned text slice by slice.
    for size, feat in zip(processed["sizes"], processed["features"]):
        end = start + size
        result.append((no_nl(cleaned[start:end]), f"{size}:{feat}"))
        start = end
    return result
|
||
|
||
with gr.Blocks(css=custom_css) as demo: | ||
with gr.Row(variant="panel"): | ||
gr.Markdown( | ||
""" | ||
## ✂️ ISCC Chunker | ||
Demo of Content-Defined Variable-Length Chunking for Shift-Resistant Text and Data Segmentation | ||
""", | ||
) | ||
with gr.Row(variant="panel"): | ||
with gr.Column(variant="panel"): | ||
in_text = gr.TextArea( | ||
label="Text Chunker", | ||
placeholder="Paste your text here", | ||
lines=12, | ||
max_lines=12, | ||
) | ||
in_chunksize = gr.Slider( | ||
label="Chunk Size", | ||
info="AVERAGE NUMBER OF CHARACTERS PER CHUNK", | ||
minimum=64, | ||
maximum=2048, | ||
step=32, | ||
value=64, | ||
) | ||
gr.Examples(label="Sample Text", examples=[sample_text], inputs=[in_text]) | ||
|
||
out_text = gr.HighlightedText( | ||
label="Chunked Text Output", | ||
interactive=False, | ||
elem_id="chunked-text", | ||
) | ||
with gr.Row(): | ||
gr.ClearButton(components=[in_text, in_chunksize, out_text]) | ||
with gr.Row(variant="panel"): | ||
gr.Markdown( | ||
""" | ||
## 📖 Help & Instructions | ||
This Demo showcases ISCC's shift-resistant chunking algorithm. Here's how to use it: | ||
A) **Paste your text** into the "Text Chunker" field or select the sample below. | ||
The **"Chunked Text Output"** will display the results, highlighting each chunk and its | ||
number of characters and associated similarity hash. | ||
B) **Edit the text** in the "Text Chunker" field | ||
Observe how most chunks stay the same (same length and same hash) even if you make edits | ||
in the beginning of the text. | ||
C) **Adjust the "Chunk Size"** slider to control the average number of characters per chunk. | ||
Observe how the chunks get smaller/larger on average. Smaller sizes result in more, | ||
more fine grained chunks, while larger sizes produce fewer, larger chunks on average. | ||
D) Use the **Clear Button** to start over. | ||
For more information about ISCC chunking, please visit: https://core.iscc.codes/algorithms/cdc/ | ||
""", | ||
) | ||
|
||
gr.Markdown( | ||
""" | ||
## What is Content-Defined Chunking? | ||
This method segments text (or data) into chunks using a content-defined approach, which is | ||
resilient to shifts in the text. It ensures that changes in the beginning of the text have | ||
minimal impact on the chunk boundaries further in the text, making it ideal for version | ||
control, data deduplication, and similar applications where detecting content changes | ||
efficiently is crucial. | ||
## How does ISCC use Content-Defined Chunking? | ||
The [Data-Code](https://github.com/iscc/iscc-core/blob/main/iscc_core/code_data.py) is | ||
generated by chunking the raw file bitstream with an average chunk size of 1024 bytes. | ||
The chunks are hashed with `xxhash` and processed with a `minhash` algorithm. | ||
It is also used by the [iscc-sdk](https://github.com/iscc/iscc-sdk) to generate granular | ||
syntactic similarity hashes for textual content with an average chunk size of 1024 | ||
characters. When activated the granular chunk hashes are attached to the generated ISCC | ||
Metadata. | ||
""" | ||
) | ||
|
||
in_text.change(chunk_text, inputs=[in_text, in_chunksize], outputs=[out_text]) | ||
in_chunksize.change(chunk_text, inputs=[in_text, in_chunksize], outputs=[out_text]) | ||
|
||
|
||
if __name__ == "__main__": | ||
demo.launch() |
Oops, something went wrong.