Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/rationalise file name #1127

Merged
merged 21 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion django_app/redbox_app/redbox_core/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def reupload(self, _request, queryset):
async_task(ingest, file.id)
logger.info("Successfully reuploaded file %s.", file)

list_display = ["original_file_name", "user", "status", "created_at", "last_referenced"]
list_display = ["file_name", "user", "status", "created_at", "last_referenced"]
list_filter = ["user", "status"]
date_hierarchy = "created_at"
actions = ["reupload"]
Expand Down
10 changes: 5 additions & 5 deletions django_app/redbox_app/redbox_core/consumers.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,20 +273,20 @@ async def handle_documents(self, response: list[Document]):
for ref, sources in sources_by_resource_ref.items():
try:
file = await File.objects.aget(original_file=ref)
payload = {"url": str(file.url), "original_file_name": file.original_file_name}
payload = {"url": str(file.url), "file_name": file.file_name}
response_sources = [
Source(
source=str(file.url),
source_type=Citation.Origin.USER_UPLOADED_DOCUMENT,
document_name=file.original_file_name,
document_name=file.file_name,
highlighted_text_in_source=cited_chunk.page_content,
page_numbers=parse_page_number(cited_chunk.metadata.get("page_number")),
)
for cited_chunk in sources
]
except File.DoesNotExist:
file = None
payload = {"url": ref, "original_file_name": None}
payload = {"url": ref, "file_name": None}
response_sources = [
Source(
source=cited_chunk.metadata["uri"],
Expand All @@ -307,10 +307,10 @@ async def handle_citations(self, citations: list[AICitation]):
for s in c.sources:
try:
file = await File.objects.aget(original_file=s.source)
payload = {"url": str(file.url), "original_file_name": file.original_file_name}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we still want to use original_file_name here so we see 'stuff.pdf' in the frontend rather than 'me@cabinetoffice/stuff.pdf'?

Copy link
Collaborator Author

@gecBurton gecBurton Oct 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

file_name will strip out me@cabinetoffice/ and just return stuff.pdf

TBH I was hoping that we could just delete original_file_name altogether in 30 days (this PR means that it's no longer getting populated)

payload = {"url": str(file.url), "file_name": file.file_name}
except File.DoesNotExist:
file = None
payload = {"url": s.source, "original_file_name": s.document_name}
payload = {"url": s.source, "file_name": s.document_name}
await self.send_to_client("source", payload)
self.citations.append((file, s))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ def handle(self, *_args, **kwargs):
for file in File.objects.exclude(status__in=INACTIVE_STATUSES):
logger.debug("Reingesting file object %s", file)
async_task(
ingest, file.id, new_index, task_name=file.original_file.name, group="re-ingest", sync=kwargs["sync"]
ingest,
file.id,
new_index,
task_name=file.file_name,
group="re-ingest",
sync=kwargs["sync"],
)
async_task(switch_aliases, env.elastic_chunk_alias, new_index, task_name="switch_aliases")
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Generated by Django 5.1.2 on 2024-10-28 08:51

import redbox_app.redbox_core.models
import storages.backends.s3
from django.db import migrations, models


class Migration(migrations.Migration):
    """Alter ``File.original_file`` so its storage path comes from ``build_s3_key``.

    The field stays a ``FileField`` backed by ``S3Storage``; this migration
    records the ``upload_to`` callable on it.
    """

    # Must run after the previous redbox_core migration.
    dependencies = [
        ("redbox_core", "0057_citation_text_in_answer"),
    ]

    operations = [
        migrations.AlterField(
            model_name="file",
            name="original_file",
            field=models.FileField(
                storage=storages.backends.s3.S3Storage,
                # Referenced by dotted path so the migration resolves the
                # same module-level callable that the model declares.
                upload_to=redbox_app.redbox_core.models.build_s3_key,
            ),
        ),
    ]
51 changes: 28 additions & 23 deletions django_app/redbox_app/redbox_core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,9 +536,21 @@ def __init__(self, file):
super().__init__(f"{file.pk} is inactive, status is {file.status}")


def build_s3_key(instance, filename: str) -> str:
    """Return the S3 object key under which an uploaded file is stored.

    The key is the owning user's email address and the uploaded file name,
    joined by ``/``. Scoping the key by user is intended to give these
    outcomes when a file name is reused:

    1. If the same user already owns a file with that name, it is overwritten.
    2. If another user owns a file with that name, a new file is created.

    Args:
        instance: The model instance being saved; must expose ``user.email``.
        filename: The name of the uploaded file.

    Returns:
        The key, in the form ``<user email>/<filename>``.
    """
    return f"{instance.user.email}/{filename}"


class File(UUIDPrimaryKeyBase, TimeStampedModel):
status = models.CharField(choices=StatusEnum.choices, null=False, blank=False)
original_file = models.FileField(storage=settings.STORAGES["default"]["BACKEND"])
original_file = models.FileField(
storage=settings.STORAGES["default"]["BACKEND"],
upload_to=build_s3_key,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this upload_to is the key change

)
user = models.ForeignKey(User, on_delete=models.CASCADE)
original_file_name = models.TextField(max_length=2048, blank=True, null=True)
last_referenced = models.DateTimeField(blank=True, null=True)
Expand All @@ -547,7 +559,7 @@ class File(UUIDPrimaryKeyBase, TimeStampedModel):
)

def __str__(self) -> str: # pragma: no cover
return f"{self.original_file_name} {self.user}"
return self.file_name

def save(self, *args, **kwargs):
if not self.last_referenced:
Expand Down Expand Up @@ -576,19 +588,9 @@ def delete_from_elastic(self):
body={"query": {"term": {"metadata.file_name.keyword": self.unique_name}}},
)

def update_status_from_core(self, status_label):
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unused

match status_label:
case "complete":
self.status = StatusEnum.complete
case "failed":
self.status = StatusEnum.errored
case _:
self.status = StatusEnum.processing
self.save()

@property
def file_type(self) -> str:
name = self.original_file.name
name = self.file_name
return name.split(".")[-1]

@property
Expand All @@ -608,7 +610,7 @@ def url(self) -> URL | None:
ClientMethod="get_object",
Params={
"Bucket": settings.AWS_STORAGE_BUCKET_NAME,
"Key": self.name,
"Key": self.file_name,
},
)
return URL(url)
Expand All @@ -620,18 +622,21 @@ def url(self) -> URL | None:
return URL(self.original_file.url)

@property
def name(self) -> str:
# User-facing name
try:
return self.original_file_name or self.original_file.name
except ValueError as e:
logger.exception("attempt to access non-existent file %s", self.pk, exc_info=e)
def file_name(self) -> str:
if self.original_file_name: # delete me?
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for backwards compatibility

return self.original_file_name

# could we have a stronger (regex?) way of stripping the user's email address?
if "/" not in self.original_file.name:
msg = "expected filename to start with the user's email address"
raise ValueError(msg)
return self.original_file.name.split("/")[1]

@property
def unique_name(self) -> str:
# Name used when processing files that exist in S3
"""primary key for accessing file in s3"""
if self.status in INACTIVE_STATUSES:
logger.exception("Attempt to access unique_name for inactive file %s with status %s", self.pk, self.status)
logger.exception("Attempt to access s3-key for inactive file %s with status %s", self.pk, self.status)
raise InactiveFileError(self)
return self.original_file.name

Expand Down Expand Up @@ -780,7 +785,7 @@ def save(self, force_insert=False, force_update=False, using=None, update_fields
@property
def uri(self) -> str:
"""returns either the url of an external citation or the file uri of a user-uploaded document"""
return self.url or f"file://{self.file.original_file_name}"
return self.url or f"file://{self.file.file_name}"


class ChatMessage(UUIDPrimaryKeyBase, TimeStampedModel):
Expand Down
2 changes: 1 addition & 1 deletion django_app/redbox_app/redbox_core/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
class FileSerializer(serializers.ModelSerializer):
class Meta:
model = File
fields = ("original_file_name",)
fields = ("file_name",)


class ChatMessageTokenUseSerializer(serializers.ModelSerializer):
Expand Down
5 changes: 2 additions & 3 deletions django_app/redbox_app/redbox_core/views/document_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def post(self, request: HttpRequest) -> HttpResponse:
# ingest errors are handled differently, as the other documents have started uploading by this point
ingest_error = self.ingest_file(uploaded_file, request.user)
if ingest_error:
ingest_errors.append(f"{uploaded_file.name}: {ingest_error[0]}")
ingest_errors.append(f"{uploaded_file.file_name}: {ingest_error[0]}")

request.session["ingest_errors"] = ingest_errors
return redirect(reverse("documents"))
Expand Down Expand Up @@ -137,7 +137,6 @@ def ingest_file(uploaded_file: UploadedFile, user: User) -> Sequence[str]:
status=StatusEnum.processing.value,
user=user,
original_file=uploaded_file,
original_file_name=uploaded_file.name,
)
except (ValueError, FieldError, ValidationError) as e:
logger.exception("Error creating File model object for %s.", uploaded_file, exc_info=e)
Expand Down Expand Up @@ -169,7 +168,7 @@ def remove_doc_view(request, doc_id: uuid):
return render(
request,
template_name="remove-doc.html",
context={"request": request, "doc_id": doc_id, "doc_name": file.name, "errors": errors},
context={"request": request, "doc_id": doc_id, "doc_name": file.file_name, "errors": errors},
)


Expand Down
1 change: 0 additions & 1 deletion django_app/redbox_app/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,6 @@
AWS_S3_OBJECT_PARAMETERS = {"ContentDisposition": "attachment"}
AWS_STORAGE_BUCKET_NAME = BUCKET_NAME # this duplication is required for django-storage
OBJECT_STORE = env.str("OBJECT_STORE")
AWS_S3_FILE_OVERWRITE = False # allows users to have duplicate file names

STORAGES = {
"default": {
Expand Down
2 changes: 1 addition & 1 deletion django_app/redbox_app/templates/citation_fragment.html
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ <h3 class="govuk-heading-s govuk-!-margin-bottom-0">
{% if citation.url %} {# an external reference #}
<a class="iai-chat-bubbles__sources-link govuk-link" href="{{ citation.url }}">{{ citation.url }}</a>
{% else %} {# a user doc #}
<a class="iai-chat-bubbles__sources-link govuk-link" href="{{ citation.file.url }}">{{ citation.file.original_file_name }}</a>
<a class="iai-chat-bubbles__sources-link govuk-link" href="{{ citation.file.url }}">{{ citation.file.file_name }}</a>
{% endif %}
</h3>
{% if citation.page_numbers %}
Expand Down
3 changes: 1 addition & 2 deletions django_app/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,12 +202,12 @@ def uploaded_file(alice: User, original_file: UploadedFile, s3_client) -> File:
file = File.objects.create(
user=alice,
original_file=original_file,
original_file_name=original_file.name,
last_referenced=datetime.now(tz=UTC) - timedelta(days=14),
status=StatusEnum.processing,
)
file.save()
yield file
file.citation_set.all().delete()
file.delete()


Expand Down Expand Up @@ -260,7 +260,6 @@ def several_files(alice: User, number_to_create: int = 4) -> Sequence[File]:
File.objects.create(
user=alice,
original_file=SimpleUploadedFile(filename, b"Lorem Ipsum."),
original_file_name=filename,
status=StatusEnum.complete,
)
)
Expand Down
1 change: 0 additions & 1 deletion django_app/tests/management/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,6 @@ def test_update_users(alice: User):
file = File.objects.create(
user=alice,
original_file=original_file,
original_file_name=original_file.name,
last_referenced=datetime.now(tz=UTC) - timedelta(days=14),
status=StatusEnum.processing,
)
Expand Down
2 changes: 1 addition & 1 deletion django_app/tests/test_admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def test_message_serializer(chat_message_with_citation_and_tokens: ChatMessage):
for k, v in expected.items():
assert actual[k] == v, k

assert actual["source_files"][0]["original_file_name"].startswith("original_file")
assert actual["source_files"][0]["file_name"].startswith("original_file")

for k, v in expected_token_usage[0].items():
assert actual["token_use"][0][k] == v, k
Expand Down
12 changes: 6 additions & 6 deletions django_app/tests/test_consumers.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ async def test_chat_consumer_with_new_session(alice: User, uploaded_file: File,
assert response4["type"] == "route"
assert response4["data"] == "gratitude"
assert response5["type"] == "source"
assert response5["data"]["original_file_name"] == uploaded_file.original_file_name
assert response5["data"]["file_name"] == uploaded_file.file_name
# Close
await communicator.disconnect()

Expand Down Expand Up @@ -188,7 +188,7 @@ async def test_chat_consumer_with_naughty_question(alice: User, uploaded_file: F
assert response4["type"] == "route"
assert response4["data"] == "gratitude"
assert response5["type"] == "source"
assert response5["data"]["original_file_name"] == uploaded_file.original_file_name
assert response5["data"]["file_name"] == uploaded_file.file_name
# Close
await communicator.disconnect()

Expand Down Expand Up @@ -226,7 +226,7 @@ async def test_chat_consumer_with_naughty_citation(
assert response3["type"] == "route"
assert response3["data"] == "gratitude"
assert response4["type"] == "source"
assert response4["data"]["original_file_name"] == uploaded_file.original_file_name
assert response4["data"]["file_name"] == uploaded_file.file_name
# Close
await communicator.disconnect()

Expand Down Expand Up @@ -265,7 +265,7 @@ async def test_chat_consumer_agentic(alice: User, uploaded_file: File, mocked_co
assert response4["type"] == "route"
assert response4["data"] == "search/agentic"
assert response5["type"] == "source"
assert response5["data"]["original_file_name"] == uploaded_file.original_file_name
assert response5["data"]["file_name"] == uploaded_file.file_name
# Close
await communicator.disconnect()

Expand Down Expand Up @@ -322,7 +322,7 @@ async def test_chat_consumer_with_selected_files(
connected, _ = await communicator.connect()
assert connected

selected_file_core_uuids: Sequence[str] = [f.s3_key for f in selected_files]
selected_file_core_uuids: Sequence[str] = [f.unique_name for f in selected_files]
await communicator.send_json_to(
{
"message": "Third question, with selected files?",
Expand Down Expand Up @@ -795,7 +795,7 @@ def mocked_connect_with_several_files(several_files: Sequence[File]) -> Connect:
json.dumps(
{
"resource_type": "documents",
"data": [{"s3_key": f.s3_key, "page_content": "a secret forth answer"} for f in several_files[2:]],
"data": [{"s3_key": f.unique_name, "page_content": "a secret forth answer"} for f in several_files[2:]],
}
),
json.dumps({"resource_type": "end"}),
Expand Down
4 changes: 0 additions & 4 deletions django_app/tests/test_migrations.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ def test_0012_alter_file_status(migrator):
file = File.objects.create(
user=user,
original_file=original_file,
original_file_name=original_file.name,
)
chat_message.source_files.set([file])
chat_message.save()
Expand Down Expand Up @@ -58,7 +57,6 @@ def test_0020_remove_chatmessage_source_files_textchunk_and_more(migrator):
file = File.objects.create(
user=user,
original_file=original_file,
original_file_name=original_file.name,
)
chat_message.source_files.set([file])
chat_message.save()
Expand Down Expand Up @@ -115,7 +113,6 @@ def test_0027_alter_file_status(migrator):
File.objects.create(
user=user,
original_file=original_file,
original_file_name=original_file.name,
status=status_option[0],
)
)
Expand Down Expand Up @@ -343,7 +340,6 @@ def test_0055_citation_source_citation_url_alter_citation_file(original_file, mi
file = File.objects.create(
user=user,
original_file=original_file,
original_file_name=original_file.name,
)

Citation = old_state.apps.get_model("redbox_core", "Citation")
Expand Down
3 changes: 0 additions & 3 deletions django_app/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ def test_file_model_last_referenced(peter_rabbit, s3_client): # noqa: ARG001
status=StatusEnum.processing,
original_file=mock_file,
user=peter_rabbit,
original_file_name="test.txt",
)

# Tests the initial value of the last_referenced
Expand Down Expand Up @@ -53,7 +52,6 @@ def test_file_model_unique_name(status: str, peter_rabbit: User, s3_client): #
status=status,
original_file=mock_file,
user=peter_rabbit,
original_file_name="test.txt",
)

assert new_file.unique_name # Check new name can be retrieved without error
Expand All @@ -74,7 +72,6 @@ def test_file_model_unique_name_error_states(status: str, peter_rabbit: User, s3
status=status,
original_file=mock_file,
user=peter_rabbit,
original_file_name="test.txt",
)

with pytest.raises(InactiveFileError, match="is inactive, status is"):
Expand Down
Loading
Loading