From d99d38f652e23bdb2fd888120d423a52d73cf8d5 Mon Sep 17 00:00:00 2001 From: Brian Lester Date: Mon, 24 Jun 2024 18:32:52 -0400 Subject: [PATCH] Update the tool to be able to search by example id and possibly title. --- licensed_pile/compare_data.py | 147 +++++++++++++++++++++++++++------- 1 file changed, 119 insertions(+), 28 deletions(-) diff --git a/licensed_pile/compare_data.py b/licensed_pile/compare_data.py index 07ba914..e57d5e0 100644 --- a/licensed_pile/compare_data.py +++ b/licensed_pile/compare_data.py @@ -1,5 +1,6 @@ """Compare pre and post processed dolma examples.""" +import collections import json import textwrap @@ -9,6 +10,8 @@ st.set_page_config(page_title="Compare", layout="wide") st.title("Compare different versions of data.") +Index = collections.namedtuple("Index", ["old", "new"]) + @st.cache_data def load_data(old, new): @@ -16,14 +19,21 @@ def load_data(old, new): old = [json.loads(l) for l in f if l] with smart_open.open(new) as f: new = [json.loads(l) for l in f if l] - return old, new - + old_idx = {o["id"]: i for i, o in enumerate(old)} + new_idx = {n["id"]: i for i, n in enumerate(new)} + # The ids aren't ordered so we can't to a posting merge like an inverted index + new_k = set(new_idx.keys()) + # Instead iterate though old idx and check if they live in the new, this + # will maintain the order as if we are looking line-by-line in the file while + # making sure that the two documents are aligned in the case of a record being + # deleted via preprocessing (we shouldn't see new ones get made). + both = [k for k in old_idx if k in new_k] + index = {k: Index(old_idx[k], new_idx[k]) for k in both} + # old: A list of examples + # new: A list of examples + # index: A Dict mapping id: str -> (position in old, position in new) + return old, new, index -if "index" not in st.session_state: - st.session_state.index = 0 -# Don't set this here, as it will be set with the value of the number input. -# if "width" not in st.session_state: -# st.session_state.width = 88 config = st.expander("config") @@ -33,43 +43,108 @@ def load_data(old, new): data_load_state = st.text(f"Loading data from:\n\t{old_path}\n\t{new_path}") - old, new = load_data(old_path, new_path) + old, new, index = load_data(old_path, new_path) + # When paging though examples by position (next/prev) this maps the position + # to the id of an example + by_position = list(index.keys()) + # When you jump to a given id, this can be used to find the logical position + # of that example as if you hit next/back a bunch. Note: This position is + # not related to the position in the old/new example list. + by_id = {k: i for i, k in enumerate(by_position)} + # Create a mapping from ids to title for easier exploration. + id_to_title = { + id: title + for id, idx in index.items() + if (title := new[idx.new].get("metadata", {}).get("title")) + } + title_to_id = {t: i for i, t in id_to_title.items()} data_load_state.text(f"Loaded {len(old)} examples") + # Display Configuration wrap_width = st.number_input("Wrap Width:", value=88, key="width") - to_wrap = st.checkbox("Wrap?", value=True) + container_height = st.number_input("Text Hight:", value=500, key="height") + +if "index" not in st.session_state: + st.session_state.index = 0 +if "id" not in st.session_state: + st.session_state.id = by_position[st.session_state.index] +# Don't set this here, as it will be set with the value of the number input. +# if "width" not in st.session_state: +# st.session_state.width = 88 +# if "height" not in st.session_state: +# st.session_state.width = 500 def update_index(i): + # Don't go outside the bounds. + if st.session_state.index == 0 and i < 0: + return + if st.session_state.index == len(by_position) - 1 and i > 0: + return + # We hit next/prev, so update the position. st.session_state.index += i + # Now convert that position into an id + st.session_state.id = by_position[st.session_state.index] + if id_to_title: + st.session_state.title = id_to_title[st.session_state.id] -def set_index(i): - st.session_state.index = i +def fix_by_id(): + # When the id is updated by a widget, make sure the index is updated to the + # correct position. + st.session_state.index = by_id[st.session_state.id] + if id_to_title: + st.session_state.title = id_to_title[st.session_state.id] -b1, b2 = st.columns(2) +def fix_by_index(): + # When the position is updated by a widget, make sure the id is updated too. + st.session_state.id = by_position[st.session_state.index] + if id_to_title: + st.session_state.title = id_to_title[st.session_state.id] + + +def fix_by_title(): + if id_to_title: + id = title_to_id[st.session_state.title] + st.session_state.id = id + st.session_state.index = by_id[st.session_state.id] + +# Display the controls +b1, b2 = st.columns(2) +# Previous and Next Buttons with b1: - p, n = st.columns(2) - with p: - st.button("prev", on_click=update_index, args=[-1]) - with n: - st.button("next", on_click=update_index, args=[1]) + st.button("prev", on_click=update_index, args=[-1]) + st.button("next", on_click=update_index, args=[1]) +# Jump around widgets with b2: index_input = st.number_input( - "Index:", min_value=0, max_value=len(old), key="index" + "Index:", + min_value=0, + max_value=len(by_position) - 1, + on_change=fix_by_index, + key="index", ) - -old_col, new_col = st.columns(2) + id_input = st.selectbox("Id:", options=by_position, on_change=fix_by_id, key="id") + if id_to_title: + title_input = st.selectbox( + "Title:", options=title_to_id.keys(), on_change=fix_by_title, key="title" + ) def wrap(text, width=88): + r"""Do a word wrap that respects previous newlines. + + This is done by splitting on \n first and then wrapping each line individually. + """ lines = text.split("\n") new_lines = [] for line in lines: + # textwrap seems to remove empty lines, even with setting the whitespace + # args to False, this check lets us preserve them. if line: new_lines.extend( textwrap.wrap( @@ -78,24 +153,40 @@ def wrap(text, width=88): ) else: new_lines.append("") - return "\n".join(new_lines) + return "\n".join(map(str.strip, new_lines)) + +# Display the examples +old_col, new_col = st.columns(2) with old_col: st.subheader("Old Text") - with st.container(height=500): + # Creating a container sets the height of it, this forces a scroll wheel that + # /only/ moves the text in this box. This makes it easy to scroll the two + # examples independently and align related sections. + with st.container(height=st.session_state.height): + # Get the text from the list based on the id -> position mapping, not the + # position of the cursor of the "next/prev" buttons. + text = old[index[st.session_state.id].old]["text"] + # Use st.text as `st.write` and `st.markdown` use markdown rules, removing + # single newlines and only counting doubles as new paragraphs. Text lets us + # keeep these newlines, but it the reason we needed our own wrap function. if to_wrap: - st.text(wrap(old[st.session_state.index]["text"], st.session_state.width)) + st.text(wrap(text, st.session_state.width)) else: - st.text(old[st.session_state.index]["text"]) + st.text(text) +# Same comments as above, but we index into the list of /new/ examples. with new_col: st.subheader("New Text") - with st.container(height=500): + with st.container(height=st.session_state.height): + text = new[index[st.session_state.id].new]["text"] if to_wrap: - st.text(wrap(new[st.session_state.index]["text"], st.session_state.width)) + st.text(wrap(text, st.session_state.width)) else: - st.text(new[st.session_state.index]["text"]) + st.text(text) +# Show the metadata for the example. We don't expect it to change much so we just +# show it for the new version. st.header("Metadata") -st.json(old[st.session_state.index]["metadata"], expanded=False) +st.json(old[index[st.session_state.id].new]["metadata"], expanded=False)