Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/rationalise file name #1127

Merged
merged 21 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion django_app/redbox_app/redbox_core/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def reupload(self, _request, queryset):
async_task(ingest, file.id)
logger.info("Successfully reuploaded file %s.", file)

list_display = ["original_file_name", "user", "status", "created_at", "last_referenced"]
list_display = ["file_name", "user", "status", "created_at", "last_referenced"]
list_filter = ["user", "status"]
date_hierarchy = "created_at"
actions = ["reupload"]
Expand Down
14 changes: 7 additions & 7 deletions django_app/redbox_app/redbox_core/consumers.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ async def llm_conversation(
state = RedboxState(
request=RedboxQuery(
question=message_history[-1].text,
s3_keys=[f.unique_name for f in selected_files],
s3_keys=[f.s3_key for f in selected_files],
user_uuid=user.id,
chat_history=[
ChainChatMessage(
Expand All @@ -139,7 +139,7 @@ async def llm_conversation(
for message in message_history[:-1]
],
ai_settings=ai_settings,
permitted_s3_keys=[f.unique_name async for f in permitted_files],
permitted_s3_keys=[f.s3_key async for f in permitted_files],
),
)

Expand Down Expand Up @@ -273,20 +273,20 @@ async def handle_documents(self, response: list[Document]):
for ref, sources in sources_by_resource_ref.items():
try:
file = await File.objects.aget(original_file=ref)
payload = {"url": str(file.url), "original_file_name": file.original_file_name}
payload = {"url": str(file.url), "file_name": file.file_name}
response_sources = [
Source(
source=str(file.url),
source_type=Citation.Origin.USER_UPLOADED_DOCUMENT,
document_name=file.original_file_name,
document_name=file.file_name,
highlighted_text_in_source=cited_chunk.page_content,
page_numbers=parse_page_number(cited_chunk.metadata.get("page_number")),
)
for cited_chunk in sources
]
except File.DoesNotExist:
file = None
payload = {"url": ref, "original_file_name": None}
payload = {"url": ref, "file_name": None}
response_sources = [
Source(
source=cited_chunk.metadata["uri"],
Expand All @@ -307,10 +307,10 @@ async def handle_citations(self, citations: list[AICitation]):
for s in c.sources:
try:
file = await File.objects.aget(original_file=s.source)
payload = {"url": str(file.url), "original_file_name": file.original_file_name}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we still want to use original_file_name here so we see 'stuff.pdf' in the frontend rather than 'me@cabinetoffice/stuff.pdf'?

Copy link
Collaborator Author

@gecBurton gecBurton Oct 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

file_name will strip out me@cabinetoffice/ and just return stuff.pdf

TBH I was hoping that we could just delete original_file_name altogether in 30 days (this PR means that it's no longer being populated)

payload = {"url": str(file.url), "file_name": file.file_name}
except File.DoesNotExist:
file = None
payload = {"url": s.source, "original_file_name": s.document_name}
payload = {"url": s.source, "file_name": s.document_name}
await self.send_to_client("source", payload)
self.citations.append((file, s))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ def handle(self, *_args, **kwargs):
for file in File.objects.exclude(status__in=INACTIVE_STATUSES):
logger.debug("Reingesting file object %s", file)
async_task(
ingest, file.id, new_index, task_name=file.original_file.name, group="re-ingest", sync=kwargs["sync"]
ingest,
file.id,
new_index,
task_name=file.file_name,
group="re-ingest",
sync=kwargs["sync"],
)
async_task(switch_aliases, env.elastic_chunk_alias, new_index, task_name="switch_aliases")
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Generated by Django 5.1.2 on 2024-10-28 08:51

import redbox_app.redbox_core.models
import storages.backends.s3
from django.db import migrations, models


class Migration(migrations.Migration):
    # Schema migration for the `file` model: re-declares `original_file`
    # so that its `upload_to` is the `build_s3_key` callable.

    # Runs after migration 0057_citation_text_in_answer of the same app.
    dependencies = [
        ('redbox_core', '0057_citation_text_in_answer'),
    ]

    operations = [
        migrations.AlterField(
            model_name='file',
            name='original_file',
            field=models.FileField(
                storage=storages.backends.s3.S3Storage,
                # Callable that derives the storage key from the model
                # instance and the uploaded file name.
                upload_to=redbox_app.redbox_core.models.build_s3_key,
            ),
        ),
    ]
55 changes: 30 additions & 25 deletions django_app/redbox_app/redbox_core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,9 +536,21 @@ def __init__(self, file):
super().__init__(f"{file.pk} is inactive, status is {file.status}")


def build_s3_key(instance, filename: str) -> str:
    """Build the S3 key under which an uploaded file is stored.

    The key is the owning user's email address and the file name joined by
    "/". It acts as the primary key for a file in S3 and needs to be unique,
    so that if a user uploads a file with the same name as

    1. an existing file that they own, then it is overwritten;
    2. an existing file that another user owns, then a new file is created.
    """
    owner_email = instance.user.email
    return f"{owner_email}/{filename}"


class File(UUIDPrimaryKeyBase, TimeStampedModel):
status = models.CharField(choices=StatusEnum.choices, null=False, blank=False)
original_file = models.FileField(storage=settings.STORAGES["default"]["BACKEND"])
original_file = models.FileField(
storage=settings.STORAGES["default"]["BACKEND"],
upload_to=build_s3_key,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this upload_to is the key change

)
user = models.ForeignKey(User, on_delete=models.CASCADE)
original_file_name = models.TextField(max_length=2048, blank=True, null=True)
last_referenced = models.DateTimeField(blank=True, null=True)
Expand All @@ -547,7 +559,7 @@ class File(UUIDPrimaryKeyBase, TimeStampedModel):
)

def __str__(self) -> str: # pragma: no cover
return f"{self.original_file_name} {self.user}"
return self.file_name

def save(self, *args, **kwargs):
if not self.last_referenced:
Expand All @@ -573,22 +585,12 @@ def delete_from_elastic(self):
if es_client.indices.exists(index=index):
es_client.delete_by_query(
index=index,
body={"query": {"term": {"metadata.file_name.keyword": self.unique_name}}},
body={"query": {"term": {"metadata.file_name.keyword": self.s3_key}}},
)

def update_status_from_core(self, status_label):
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unused

match status_label:
case "complete":
self.status = StatusEnum.complete
case "failed":
self.status = StatusEnum.errored
case _:
self.status = StatusEnum.processing
self.save()

@property
def file_type(self) -> str:
name = self.original_file.name
name = self.file_name
return name.split(".")[-1]

@property
Expand All @@ -608,7 +610,7 @@ def url(self) -> URL | None:
ClientMethod="get_object",
Params={
"Bucket": settings.AWS_STORAGE_BUCKET_NAME,
"Key": self.name,
"Key": self.file_name,
},
)
return URL(url)
Expand All @@ -620,18 +622,21 @@ def url(self) -> URL | None:
return URL(self.original_file.url)

@property
def name(self) -> str:
# User-facing name
try:
return self.original_file_name or self.original_file.name
except ValueError as e:
logger.exception("attempt to access non-existent file %s", self.pk, exc_info=e)
def file_name(self) -> str:
if self.original_file_name: # delete me?
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for backwards compatibility

return self.original_file_name

# could have a stronger (regex?) way of stripping the user's email address?
if "/" not in self.original_file.name:
msg = "expected filename to start with the user's email address"
raise ValueError(msg)
return self.original_file.name.split("/")[1]

@property
def unique_name(self) -> str:
# Name used when processing files that exist in S3
def s3_key(self) -> str:
"""primary key for accessing file in s3"""
if self.status in INACTIVE_STATUSES:
logger.exception("Attempt to access unique_name for inactive file %s with status %s", self.pk, self.status)
logger.exception("Attempt to access s3-key for inactive file %s with status %s", self.pk, self.status)
raise InactiveFileError(self)
return self.original_file.name

Expand Down Expand Up @@ -780,7 +785,7 @@ def save(self, force_insert=False, force_update=False, using=None, update_fields
@property
def uri(self) -> str:
"""returns either the url of an external citation or the file uri of a user-uploaded document"""
return self.url or f"file://{self.file.original_file_name}"
return self.url or f"file://{self.file.file_name}"


class ChatMessage(UUIDPrimaryKeyBase, TimeStampedModel):
Expand Down
2 changes: 1 addition & 1 deletion django_app/redbox_app/redbox_core/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
class FileSerializer(serializers.ModelSerializer):
class Meta:
model = File
fields = ("original_file_name",)
fields = ("file_name",)


class ChatMessageTokenUseSerializer(serializers.ModelSerializer):
Expand Down
7 changes: 3 additions & 4 deletions django_app/redbox_app/redbox_core/views/document_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def post(self, request: HttpRequest) -> HttpResponse:
# ingest errors are handled differently, as the other documents have started uploading by this point
ingest_error = self.ingest_file(uploaded_file, request.user)
if ingest_error:
ingest_errors.append(f"{uploaded_file.name}: {ingest_error[0]}")
ingest_errors.append(f"{uploaded_file.file_name}: {ingest_error[0]}")

request.session["ingest_errors"] = ingest_errors
return redirect(reverse("documents"))
Expand Down Expand Up @@ -137,13 +137,12 @@ def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
status=StatusEnum.processing.value,
user=user,
original_file=uploaded_file,
original_file_name=uploaded_file.name,
)
except (ValueError, FieldError, ValidationError) as e:
logger.exception("Error creating File model object for %s.", uploaded_file, exc_info=e)
return e.args
else:
async_task(ingest, file.id, task_name=file.unique_name, group="ingest")
async_task(ingest, file.id, task_name=file.s3_key, group="ingest")


@login_required
Expand All @@ -169,7 +168,7 @@ def remove_doc_view(request, doc_id: uuid):
return render(
request,
template_name="remove-doc.html",
context={"request": request, "doc_id": doc_id, "doc_name": file.name, "errors": errors},
context={"request": request, "doc_id": doc_id, "doc_name": file.file_name, "errors": errors},
)


Expand Down
1 change: 0 additions & 1 deletion django_app/redbox_app/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,6 @@
AWS_S3_OBJECT_PARAMETERS = {"ContentDisposition": "attachment"}
AWS_STORAGE_BUCKET_NAME = BUCKET_NAME # this duplication is required for django-storage
OBJECT_STORE = env.str("OBJECT_STORE")
AWS_S3_FILE_OVERWRITE = False # allows users to have duplicate file names

STORAGES = {
"default": {
Expand Down
2 changes: 1 addition & 1 deletion django_app/redbox_app/templates/citation_fragment.html
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ <h3 class="govuk-heading-s govuk-!-margin-bottom-0">
{% if citation.url %} {# an external reference #}
<a class="iai-chat-bubbles__sources-link govuk-link" href="{{ citation.url }}">{{ citation.url }}</a>
{% else %} {# a user doc #}
<a class="iai-chat-bubbles__sources-link govuk-link" href="{{ citation.file.url }}">{{ citation.file.original_file_name }}</a>
<a class="iai-chat-bubbles__sources-link govuk-link" href="{{ citation.file.url }}">{{ citation.file.file_name }}</a>
{% endif %}
</h3>
{% if citation.page_numbers %}
Expand Down
2 changes: 1 addition & 1 deletion django_app/redbox_app/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def ingest(file_id: UUID, es_index: str | None = None) -> None:

logging.info("Ingesting file: %s", file)

if error := ingest_file(file.unique_name, es_index):
if error := ingest_file(file.s3_key, es_index):
file.status = StatusEnum.errored
file.ingest_error = error
else:
Expand Down
3 changes: 1 addition & 2 deletions django_app/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,12 +202,12 @@ def uploaded_file(alice: User, original_file: UploadedFile, s3_client) -> File:
file = File.objects.create(
user=alice,
original_file=original_file,
original_file_name=original_file.name,
last_referenced=datetime.now(tz=UTC) - timedelta(days=14),
status=StatusEnum.processing,
)
file.save()
yield file
file.citation_set.all().delete()
file.delete()


Expand Down Expand Up @@ -260,7 +260,6 @@ def several_files(alice: User, number_to_create: int = 4) -> Sequence[File]:
File.objects.create(
user=alice,
original_file=SimpleUploadedFile(filename, b"Lorem Ipsum."),
original_file_name=filename,
status=StatusEnum.complete,
)
)
Expand Down
1 change: 0 additions & 1 deletion django_app/tests/management/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,6 @@ def test_update_users(alice: User):
file = File.objects.create(
user=alice,
original_file=original_file,
original_file_name=original_file.name,
last_referenced=datetime.now(tz=UTC) - timedelta(days=14),
status=StatusEnum.processing,
)
Expand Down
2 changes: 1 addition & 1 deletion django_app/tests/test_admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def test_message_serializer(chat_message_with_citation_and_tokens: ChatMessage):
for k, v in expected.items():
assert actual[k] == v, k

assert actual["source_files"][0]["original_file_name"].startswith("original_file")
assert actual["source_files"][0]["file_name"].startswith("original_file")

for k, v in expected_token_usage[0].items():
assert actual["token_use"][0][k] == v, k
Expand Down
Loading
Loading