From 865e76a0832278507a959638967c8ae3348c7c33 Mon Sep 17 00:00:00 2001
From: Magic_yuan <317617749@qq.com>
Date: Mon, 9 Dec 2024 15:08:30 +0800
Subject: [PATCH] Fix bug https://github.com/HKUDS/LightRAG/issues/306
 Main changes: add validation when storing text-chunk data so that only
 valid data is stored; add an empty-list check before processing text
 chunks; filter out invalid data before truncating text chunks; add more
 warning log messages. Query changes: add a validity check for chunks and
 filter out invalid chunks.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lightrag/operate.py | 59 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 46 insertions(+), 13 deletions(-)

diff --git a/lightrag/operate.py b/lightrag/operate.py
index feaec27d..45c9ef16 100644
--- a/lightrag/operate.py
+++ b/lightrag/operate.py
@@ -990,23 +990,37 @@ async def _find_related_text_unit_from_relationships(
     for index, unit_list in enumerate(text_units):
         for c_id in unit_list:
             if c_id not in all_text_units_lookup:
-                all_text_units_lookup[c_id] = {
-                    "data": await text_chunks_db.get_by_id(c_id),
-                    "order": index,
-                }
+                chunk_data = await text_chunks_db.get_by_id(c_id)
+                # Only store valid data
+                if chunk_data is not None and "content" in chunk_data:
+                    all_text_units_lookup[c_id] = {
+                        "data": chunk_data,
+                        "order": index,
+                    }
+
+    if not all_text_units_lookup:
+        logger.warning("No valid text chunks found")
+        return []
 
-    if any([v is None for v in all_text_units_lookup.values()]):
-        logger.warning("Text chunks are missing, maybe the storage is damaged")
-    all_text_units = [
-        {"id": k, **v} for k, v in all_text_units_lookup.items() if v is not None
-    ]
+    all_text_units = [{"id": k, **v} for k, v in all_text_units_lookup.items()]
     all_text_units = sorted(all_text_units, key=lambda x: x["order"])
-    all_text_units = truncate_list_by_token_size(
-        all_text_units,
+
+    # Ensure all text chunks have content
+    valid_text_units = [
+        t for t in all_text_units if t["data"] is not None and "content" in t["data"]
+    ]
+
+    if not valid_text_units:
+        logger.warning("No valid text chunks after filtering")
+        return []
+
+    truncated_text_units = truncate_list_by_token_size(
+        valid_text_units,
         key=lambda x: x["data"]["content"],
         max_token_size=query_param.max_token_for_text_unit,
     )
-    all_text_units: list[TextChunkSchema] = [t["data"] for t in all_text_units]
+
+    all_text_units: list[TextChunkSchema] = [t["data"] for t in truncated_text_units]
 
     return all_text_units
 
@@ -1050,24 +1064,43 @@ async def naive_query(
     results = await chunks_vdb.query(query, top_k=query_param.top_k)
     if not len(results):
         return PROMPTS["fail_response"]
+
     chunks_ids = [r["id"] for r in results]
     chunks = await text_chunks_db.get_by_ids(chunks_ids)
 
+    # Filter out invalid chunks
+    valid_chunks = [
+        chunk for chunk in chunks if chunk is not None and "content" in chunk
+    ]
+
+    if not valid_chunks:
+        logger.warning("No valid chunks found after filtering")
+        return PROMPTS["fail_response"]
+
     maybe_trun_chunks = truncate_list_by_token_size(
-        chunks,
+        valid_chunks,
         key=lambda x: x["content"],
         max_token_size=query_param.max_token_for_text_unit,
     )
+
+    if not maybe_trun_chunks:
+        logger.warning("No chunks left after truncation")
+        return PROMPTS["fail_response"]
+
     logger.info(f"Truncate {len(chunks)} to {len(maybe_trun_chunks)} chunks")
     section = "\n--New Chunk--\n".join([c["content"] for c in maybe_trun_chunks])
+
     if query_param.only_need_context:
         return section
+
     sys_prompt_temp = PROMPTS["naive_rag_response"]
     sys_prompt = sys_prompt_temp.format(
         content_data=section, response_type=query_param.response_type
     )
+
     if query_param.only_need_prompt:
         return sys_prompt
+
     response = await use_model_func(
         query,
         system_prompt=sys_prompt,