Skip to content

Commit

Permalink
Brought pre-commit to pass
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesbraza committed Jun 26, 2024
1 parent 324ddb7 commit c8f3727
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 57 deletions.
32 changes: 16 additions & 16 deletions paperqa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,29 +21,29 @@
from .version import __version__

__all__ = [
"Docs",
"Answer",
"PromptCollection",
"__version__",
"Doc",
"Text",
"AnthropicLLMModel",
"Context",
"LLMModel",
"Doc",
"Docs",
"EmbeddingModel",
"OpenAIEmbeddingModel",
"HybridEmbeddingModel",
"SparseEmbeddingModel",
"OpenAILLMModel",
"AnthropicLLMModel",
"LLMModel",
"LLMResult",
"LangchainEmbeddingModel",
"LangchainLLMModel",
"LangchainVectorStore",
"LlamaEmbeddingModel",
"SentenceTransformerEmbeddingModel",
"LangchainEmbeddingModel",
"NumpyVectorStore",
"LangchainVectorStore",
"OpenAIEmbeddingModel",
"OpenAILLMModel",
"PromptCollection",
"SentenceTransformerEmbeddingModel",
"SparseEmbeddingModel",
"Text",
"__version__",
"embedding_model_factory",
"llm_model_factory",
"print_callback",
"LLMResult",
"vector_store_factory",
"llm_model_factory",
"embedding_model_factory",
]
39 changes: 20 additions & 19 deletions paperqa/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def config_summary_llm_config(cls, data: Any) -> Any:
if (
data.summary_llm_model is None
and data.llm == "default"
and type(data.llm_model) == OpenAILLMModel
and isinstance(data.llm_model, OpenAILLMModel)
):
data.summary_llm_model = OpenAILLMModel(
config={"model": "gpt-3.5-turbo", "temperature": 0.1}
Expand Down Expand Up @@ -530,8 +530,8 @@ async def adoc_match(
if (
rerank is None
and (
type(self.llm_model) == OpenAILLMModel
and cast(OpenAILLMModel, self).config["model"].startswith("gpt-4")
isinstance(self.llm_model, OpenAILLMModel)
and self.config["model"].startswith("gpt-4")
)
or rerank is True
):
Expand Down Expand Up @@ -703,33 +703,34 @@ async def process(match): # noqa: C901, PLR0912
if self.strip_citations:
# remove citations that collide with our grounded citations (for the answer LLM)
context = strip_citations(context)
c = Context(
context=context,
text=Text(
text=match.text,
name=match.name,
doc=match.doc.__class__(
**match.doc.model_dump(exclude="embedding")
return (
Context(
context=context,
text=Text(
text=match.text,
name=match.name,
doc=match.doc.__class__(
**match.doc.model_dump(exclude="embedding")
),
),
score=score,
**extras,
),
score=score,
**extras,
llm_result,
)
return c, llm_result

results = await gather_with_concurrency(
self.max_concurrent, [process(m) for m in matches]
)
# update token counts
[answer.add_tokens(r[1]) for r in results]

# filter out failures
contexts = [c for c, r in results if c is not None]

# filter out failures, sort by score, limit to max_sources
answer.contexts = sorted(
contexts + answer.contexts, key=lambda x: x.score, reverse=True
)
answer.contexts = answer.contexts[:max_sources]
[c for c, r in results if c is not None] + answer.contexts,
key=lambda x: x.score,
reverse=True,
)[:max_sources]
context_str = "\n\n".join(
[
f"{c.text.name}: {c.context}"
Expand Down
14 changes: 7 additions & 7 deletions paperqa/llms.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,11 @@ def guess_model_type(model_name: str) -> str: # noqa: PLR0911
def is_anyscale_model(model_name: str) -> bool:
# compares prefixes with anyscale models
# https://docs.anyscale.com/endpoints/text-generation/query-a-model/
if (
return bool(
os.environ.get("ANYSCALE_API_KEY")
and os.environ.get("ANYSCALE_BASE_URL")
and model_name.startswith(ANYSCALE_MODEL_PREFIXES)
):
return True
return False
)


def is_openai_model(model_name: str) -> bool:
Expand Down Expand Up @@ -296,8 +294,8 @@ async def execute(
)
result.prompt = messages
result.prompt_count = sum(
[self.count_tokens(m["content"]) for m in messages]
) + sum([self.count_tokens(m["role"]) for m in messages])
self.count_tokens(m["content"]) for m in messages
) + sum(self.count_tokens(m["role"]) for m in messages)

if callbacks is None:
output = await self.achat(client, messages)
Expand Down Expand Up @@ -612,14 +610,16 @@ async def similarity_search(
def clear(self) -> None:
pass

async def max_marginal_relevance_search( # noqa: D417
async def max_marginal_relevance_search(
self, client: Any, query: str, k: int, fetch_k: int
) -> tuple[Sequence[Embeddable], list[float]]:
"""Vectorized implementation of Maximal Marginal Relevance (MMR) search.
Args:
client: TODOC.
query: Query vector.
k: Number of results to return.
fetch_k: Number of results to fetch from the vector store.
Returns:
List of tuples (doc, score) of length k.
Expand Down
4 changes: 2 additions & 2 deletions paperqa/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def parse_text(
"parsing_libraries": ["tiktoken (cl100k_base)"] if use_tiktoken else [],
"paperqa_version": str(pqa_version),
"total_parsed_text_length": (
len(text) if isinstance(text, str) else sum([len(t) for t in text])
len(text) if isinstance(text, str) else sum(len(t) for t in text)
),
"parse_type": "txt" if not html else "html",
}
Expand Down Expand Up @@ -254,7 +254,7 @@ def read_doc(
) -> tuple[list[Text], ParsedMetadata]: ...


def read_doc( # noqa: PLR0912
def read_doc(
path: Path,
doc: Doc,
parsed_text_only: bool = False,
Expand Down
10 changes: 3 additions & 7 deletions paperqa/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,7 @@
def name_in_text(name: str, text: str) -> bool:
sname = name.strip()
pattern = rf"\b({re.escape(sname)})\b(?!\w)"
if re.search(pattern, text):
return True
return False
return bool(re.search(pattern, text))


def maybe_is_text(s: str, thresh: float = 2.5) -> bool:
Expand All @@ -33,9 +31,7 @@ def maybe_is_text(s: str, thresh: float = 2.5) -> bool:
entropy += -p * math.log2(p)

# Check if the entropy is within a reasonable range for text
if entropy > thresh:
return True
return False
return entropy > thresh


def maybe_is_pdf(file: BinaryIO) -> bool:
Expand All @@ -47,7 +43,7 @@ def maybe_is_pdf(file: BinaryIO) -> bool:
def maybe_is_html(file: BinaryIO) -> bool:
magic_number = file.read(4)
file.seek(0)
return magic_number in (b"<htm", b"<!DO", b"<xsl", b"<!X")
return magic_number in {b"<htm", b"<!DO", b"<xsl", b"<!X"}


def strings_similarity(s1: str, s2: str) -> float:
Expand Down
11 changes: 5 additions & 6 deletions tests/test_paperqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -1030,7 +1030,7 @@ def test_langchain_llm():
assert docs2.summary_llm == "babbage-002"
docs2.get_evidence(
Answer(question="What is Frederick Bates's greatest accomplishment?"),
get_callbacks=lambda x: [lambda y: print(y)], # noqa: ARG005
get_callbacks=lambda x: [print], # noqa: ARG005
)


Expand Down Expand Up @@ -1117,8 +1117,7 @@ async def test_langchain_vector_store():
index.add_texts_and_embeddings(some_texts)
assert index._store is not None
# check search returns Text obj
data, score = await index.similarity_search(None, "test", k=1) # type: ignore[unreachable]
print(data)
data, _ = await index.similarity_search(None, "test", k=1) # type: ignore[unreachable]
assert isinstance(data[0], Text)

# now try with convenience
Expand Down Expand Up @@ -1386,7 +1385,7 @@ def test_parser_only_reader():
assert any("pypdf" in t for t in parsed_text.metadata.parsing_libraries)
assert parsed_text.metadata.chunk_metadata is None
assert parsed_text.metadata.total_parsed_text_length == sum(
[len(t) for t in parsed_text.content.values()] # type: ignore[misc,union-attr]
len(t) for t in parsed_text.content.values() # type: ignore[misc,union-attr]
)


Expand Down Expand Up @@ -1491,12 +1490,12 @@ def test_citation():
f.write(r.text)
docs = Docs()
docs.add(doc_path) # type: ignore[arg-type]
assert next(iter(docs.docs.values())).docname in (
assert next(iter(docs.docs.values())).docname in {
"Wikipedia2024",
"Frederick2024",
"Wikipedia",
"Frederick",
)
}


def test_dockey_filter():
Expand Down

0 comments on commit c8f3727

Please sign in to comment.