Refactor into separate demos, add comparison demo

iscc · Feb 17, 2024 · beba2a9 · beba2a9
1 parent 65accba
commit beba2a9
Show file tree

Hide file tree

Showing 10 changed files with 1,179 additions and 1,122 deletions.
diff --git a/app.py b/app.py
@@ -1,207 +1,48 @@
-import io
-import base64
 import gradio as gr
-import iscc_core as ic
-import iscc_sdk as idk
-from PIL import Image
-
-idk.sdk_opts.image_thumbnail_size = 265
-idk.sdk_opts.image_thumbnail_quality = 80
-
+from demos.generate import demo as demo_generate
+from demos.compare import demo as demo_compare
+from demos.inspect_ import demo as demo_inspect
+from demos.chunker import demo as demo_chunker
 
 custom_css = """
-.fixed-height img {
-    height: 265px;  /* Fixed height */
+.fixed-height {
+    height: 240px;  /* Fixed height */
     object-fit: contain;  /* Scale the image to fit within the element */
 }
 #chunked-text span.label {
     text-transform: none !important;
 }
-"""
-
-newline_symbols = {
-    "\u000a": "⏎",  # Line Feed - Represented by the 'Return' symbol
-    "\u000b": "↨",  # Vertical Tab - Represented by the 'Up Down Arrow' symbol
-    "\u000c": "␌",  # Form Feed - Unicode Control Pictures representation
-    "\u000d": "↵",  # Carriage Return - 'Downwards Arrow with Corner Leftwards' symbol
-    "\u0085": "⤓",  # Next Line - 'Downwards Arrow with Double Stroke' symbol
-    "\u2028": "↲",  # Line Separator - 'Downwards Arrow with Tip Leftwards' symbol
-    "\u2029": "¶",  # Paragraph Separator - Represented by the 'Pilcrow' symbol
+.json-holder {
+    word-wrap: break-word;
+    white-space: pre-wrap;
 }
 
+#examples-a, #examples-b {
+    height: 140px;  /* Fixed height */
+    object-fit: contain;  /* Scale the image to fit within the element */
+}
 
-def no_nl(text):
-    for char, symbol in newline_symbols.items():
-        text = text.replace(char, symbol)
-    return text
-
-
-def generate_iscc(file):
-    imeta = idk.code_iscc(file.name)
-    thumbnail = None
-    if imeta.thumbnail:
-        header, encoded = imeta.thumbnail.split(",", 1)
-        data = base64.b64decode(encoded)
-        thumbnail = Image.open(io.BytesIO(data))
-    metadata = imeta.dict(exclude_unset=False, by_alias=True)
-    if metadata.get("thumbnail"):
-        del metadata["thumbnail"]
-    return imeta.iscc, thumbnail, metadata
-
-
-def explain_iscc(code):
-    canonical = ic.iscc_normalize(code)
-    human = " - ".join(ic.iscc_explain(code).split("-"))
-    code_obj = ic.Code(canonical)
-    decomposed = " - ".join(ic.iscc_decompose(canonical))
-    multiformat = code_obj.mf_base58btc
-    return canonical, human, decomposed, multiformat
-
-
-def chunk_text(text, chunk_size):
-    original_chunk_size = idk.sdk_opts.text_avg_chunk_size
-    idk.sdk_opts.text_avg_chunk_size = chunk_size
-    cleaned = ic.text_clean(text)
-    processed = idk.text_features(cleaned)
-    features = processed["features"]
-    sizes = processed["sizes"]
-    start = 0
-    chunks = []
-    for size in sizes:
-        end = start + size
-        chunks.append(no_nl(cleaned[start:end]))
-        start = end
-    result = [
-        (chunk, f"{size}:{feat}") for chunk, size, feat in zip(chunks, sizes, features)
-    ]
-    idk.sdk_opts.text_avg_chunk_size = original_chunk_size
-    return result
-
-
-####################################################################################################
-# TAB ISCC-CODE                                                                                    #
-####################################################################################################
-
-with gr.Blocks() as demo_generate:
-    gr.Markdown(
-        """
-    ## 🌟 ISCC-CODE Generator - The DNA of Digital Content
-    """
-    )
-    with gr.Row():
-        with gr.Column(scale=2):
-            in_file = gr.File(label="Media File")
-        with gr.Column(scale=1):
-            out_thumbnail = gr.Image(
-                label="Extracted Thumbnail", elem_classes=["fixed-height"]
-            )
-    with gr.Row():
-        out_iscc = gr.Text(label="ISCC-CODE", show_copy_button=True)
-    with gr.Row():
-        out_meta = gr.Json(label="Metadata")
-    in_file.change(
-        generate_iscc, inputs=[in_file], outputs=[out_iscc, out_thumbnail, out_meta]
-    )
-
-####################################################################################################
-# TAB ENCODING                                                                                     #
-####################################################################################################
-
-with gr.Blocks() as demo_decode:
-    gr.Markdown(
-        """
-    ## 🌟 A Codec for Self-Describing Compact Binary Codes
-    """
-    )
-    with gr.Row():
-        with gr.Column():
-            in_iscc = gr.Text(
-                label="ISCC",
-                info="INPUT ANY VALID ISCC-CODE OR ISCC-UNIT",
-                autofocus=True,
-            )
-            examples = [
-                "ISCC:AAAWN77F727NXSUS",  # Meta-Code
-                "bzqaqaal5rvp72lx2thvq",  # Multiformat
-                "ISCC:EAASKDNZNYGUUF5A",  # Text-Code
-                "ISCC:GABW5LUBVP23N3DOD7PPINHT5JKBI",  # Data-Code 128 bits
-                "ISCC:KUAG5LUBVP23N3DOHCHWIYGXVN7ZS",  # ISCC-SUM
-                "ISCC:KAA2Y5NUST7BFD5NN2XIDK7VW3WG4OEPMRQNPK37TE",  # ISCC-CDI
-                "z36hVxiqoF8AAmDpZV958hn3tsv2i7v1NfCrSzpq",  # ISCC-CDI multiformats
-                "ISCC:KACT4EBWK27737D2AYCJRAL5Z36G76RFRMO4554RU26HZ4ORJGIVHDI",
-            ]
-            gr.Examples(label="Example ISCCs", examples=examples, inputs=[in_iscc])
-
-    gr.Markdown("## Different Encodings:")
-    with gr.Row():
-        with gr.Column():
-            out_canonical = gr.Text(
-                label="Canonical",
-                info="NORMALIZED STANDARD REPRESENTATION",
-                show_copy_button=True,
-            )
-            out_human = gr.Text(
-                label="Human Readable",
-                info="MAINTYPE - SUBTYPE - VERSION - LENGTH - BODY",
-                show_copy_button=True,
-            )
-            out_decomposed = gr.Text(
-                label="Decomposed",
-                info="ISCC-UNITS",
-                show_copy_button=True,
-            )
-            out_multiformat = gr.Text(
-                label="Multiformat",
-                info="BASE58-BTC",
-                show_copy_button=True,
-            )
-    in_iscc.change(
-        explain_iscc,
-        inputs=[in_iscc],
-        outputs=[
-            out_canonical,
-            out_human,
-            out_decomposed,
-            out_multiformat,
-        ],
-    )
+textarea {
+    font-family: JetBrains Mono;
+}
+"""
 
-####################################################################################################
-# CHUNKING                                                                                         #
-####################################################################################################
 
-with gr.Blocks() as demo_chunking:
-    gr.Markdown(
-        """
-    ## 🌟 Content Defined Chunking for Shift-Resistant Text and Data Segmentation
-    """
-    )
-    with gr.Row():
-        with gr.Column():
-            in_text = gr.Textbox(label="Text Input", lines=8, autofocus=True)
-            in_chunksize = gr.Slider(
-                label="Chunk Size",
-                info="AVERAGE NUMBER OF CHARACTERS PER CHUNK",
-                minimum=32,
-                maximum=2048,
-                step=32,
-                value=64,
-            )
+iscc_theme = gr.themes.Default(
+    font=gr.themes.GoogleFont("Readex Pro"),
+    font_mono=gr.themes.GoogleFont("JetBrains Mono"),
+    radius_size=gr.themes.sizes.radius_none,
+)
 
-        out_text = gr.HighlightedText(
-            label="Chunked Text Output",
-            interactive=False,
-            elem_id="chunked-text",
-        )
-    in_text.change(chunk_text, inputs=[in_text, in_chunksize], outputs=[out_text])
-    in_chunksize.change(chunk_text, inputs=[in_text, in_chunksize], outputs=[out_text])
 
 demo = gr.TabbedInterface(
-    title="▶️ ISCC Playground",
-    interface_list=[demo_generate, demo_decode, demo_chunking],
-    tab_names=["ISCC-CODE", "ENCODING", "CHUNKING"],
+    title="▶️ ISCC Playground - The DNA of your digital content",
+    interface_list=[demo_generate, demo_compare, demo_inspect, demo_chunker],
+    tab_names=["GENERATE", "COMPARE", "INSPECT", "CHUNKER"],
     css=custom_css,
+    theme=iscc_theme,
 )
 
+
 if __name__ == "__main__":
     demo.launch()
diff --git a/demos/__init__.py b/demos/__init__.py
diff --git a/demos/chunker.py b/demos/chunker.py
@@ -0,0 +1,144 @@
+import gradio as gr
+import iscc_core as ic
+import iscc_sdk as idk
+import pathlib
+
+
+HERE = pathlib.Path(__file__).parent.absolute()
+SAMPLE_FILEPATH = HERE / "samples/sample.txt"
+sample_text = SAMPLE_FILEPATH.read_text(encoding="utf-8")  # read_text closes the file handle
+
+newline_symbols = {
+    "\u000a": "⏎",  # Line Feed - Represented by the 'Return' symbol
+    "\u000b": "↨",  # Vertical Tab - Represented by the 'Up Down Arrow with Base' symbol
+    "\u000c": "␌",  # Form Feed - Unicode Control Pictures representation
+    "\u000d": "↵",  # Carriage Return - 'Downwards Arrow with Corner Leftwards' symbol
+    "\u0085": "⤓",  # Next Line - 'Downwards Arrow to Bar' symbol
+    "\u2028": "↲",  # Line Separator - 'Downwards Arrow with Tip Leftwards' symbol
+    "\u2029": "¶",  # Paragraph Separator - Represented by the 'Pilcrow' symbol
+}
+
+custom_css = """
+#chunked-text span.label {
+    text-transform: none !important;
+}
+"""
+
+
+def no_nl(text):
+    """Swap each newline-like control character for its visible stand-in symbol."""
+    # Every key in newline_symbols is a single character, so one translate pass suffices
+    table = str.maketrans(newline_symbols)
+    return text.translate(table)
+
+
+def chunk_text(text, chunk_size):
+    """Chunk cleaned text and return (chunk, "size:feature") pairs for highlighting."""
+    original_chunk_size = idk.sdk_opts.text_avg_chunk_size
+    idk.sdk_opts.text_avg_chunk_size = chunk_size
+    try:
+        cleaned = ic.text_clean(text)
+        processed = idk.text_features(cleaned)
+    finally:
+        # Restore the shared SDK option even when cleaning/feature extraction raises
+        idk.sdk_opts.text_avg_chunk_size = original_chunk_size
+    features, sizes = processed["features"], processed["sizes"]
+    start = 0
+    chunks = []
+    for size in sizes:
+        end = start + size
+        chunks.append(no_nl(cleaned[start:end]))
+        start = end
+    return [(chunk, f"{size}:{feat}") for chunk, size, feat in zip(chunks, sizes, features)]
+
+
+with gr.Blocks(css=custom_css) as demo:
+    with gr.Row(variant="panel"):
+        gr.Markdown(
+            """
+        ## ✂️ ISCC Chunker
+        Demo of Content-Defined Variable-Length Chunking for Shift-Resistant Text and Data Segmentation
+        """,
+        )
+    with gr.Row(variant="panel"):
+        with gr.Column(variant="panel"):
+            in_text = gr.TextArea(
+                label="Text Chunker",
+                placeholder="Paste your text here",
+                lines=12,
+                max_lines=12,
+            )
+            in_chunksize = gr.Slider(
+                label="Chunk Size",
+                info="AVERAGE NUMBER OF CHARACTERS PER CHUNK",
+                minimum=64,
+                maximum=2048,
+                step=32,
+                value=64,
+            )
+            gr.Examples(label="Sample Text", examples=[sample_text], inputs=[in_text])
+
+        out_text = gr.HighlightedText(
+            label="Chunked Text Output",
+            interactive=False,
+            elem_id="chunked-text",
+        )
+    with gr.Row():
+        gr.ClearButton(components=[in_text, in_chunksize, out_text])
+    with gr.Row(variant="panel"):
+        gr.Markdown(
+            """
+        ## 📖 Help & Instructions
+
+        This Demo showcases ISCC's shift-resistant chunking algorithm. Here's how to use it:
+
+        A) **Paste your text** into the "Text Chunker" field or select the sample below.
+
+        The **"Chunked Text Output"** will display the results, highlighting each chunk and its
+        number of characters and associated similarity hash.
+
+        B) **Edit the text** in the "Text Chunker" field
+
+        Observe how most chunks stay the same (same length and same hash) even if you make edits
+        in the beginning of the text.
+
+        C) **Adjust the "Chunk Size"** slider to control the average number of characters per chunk.
+
+        Observe how the chunks get smaller/larger on average. Smaller sizes result in more,
+        finer-grained chunks, while larger sizes produce fewer, larger chunks on average.
+
+        D) Use the **Clear Button** to start over.
+
+        For more information about ISCC chunking, please visit: https://core.iscc.codes/algorithms/cdc/
+        """,
+        )
+
+        gr.Markdown(
+            """
+        ## What is Content-Defined Chunking?
+
+        This method segments text (or data) into chunks using a content-defined approach, which is
+        resilient to shifts in the text. It ensures that changes in the beginning of the text have
+        minimal impact on the chunk boundaries further in the text, making it ideal for version
+        control, data deduplication, and similar applications where detecting content changes
+        efficiently is crucial.
+
+        ## How does ISCC use Content-Defined Chunking?
+
+        The [Data-Code](https://github.com/iscc/iscc-core/blob/main/iscc_core/code_data.py) is
+        generated by chunking the raw file bitstream with an average chunk size of 1024 bytes.
+        The chunks are hashed with `xxhash` and processed with a `minhash` algorithm.
+
+        It is also used by the [iscc-sdk](https://github.com/iscc/iscc-sdk) to generate granular
+        syntactic similarity hashes for textual content with an average chunk size of 1024
+        characters. When activated the granular chunk hashes are attached to the generated ISCC
+        Metadata.
+        """
+        )
+
+    in_text.change(chunk_text, inputs=[in_text, in_chunksize], outputs=[out_text])
+    in_chunksize.change(chunk_text, inputs=[in_text, in_chunksize], outputs=[out_text])
+
+
+if __name__ == "__main__":
+    demo.launch()