Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""add columns to collection job and documents table

Revision ID: 051
Revises: 050
Create Date: 2026-03-25 10:09:47.318575

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "051"
down_revision = "050"
branch_labels = None
depends_on = None


def upgrade():
    """Add size/count tracking columns to collection_jobs and document tables."""
    # New tracking columns on collection_jobs, added in their original order.
    job_columns = (
        sa.Column(
            "docs_num",
            sa.Integer(),
            nullable=True,
            comment="Total number of documents to be processed in this job",
        ),
        sa.Column(
            "total_size_mb",
            sa.Float(),
            nullable=True,
            comment="Total size of documents being uploaded to collection in MB",
        ),
        sa.Column(
            "documents",
            sa.JSON(),
            nullable=True,
            comment="List of documents given to make collection",
        ),
    )
    for column in job_columns:
        op.add_column("collection_jobs", column)

    # Per-document size tracking on the document table.
    op.add_column(
        "document",
        sa.Column(
            "file_size_kb",
            sa.Float(),
            nullable=True,
            comment="Size of the document in kilobytes (KB)",
        ),
    )


def downgrade():
    """Remove the columns added in revision 051.

    Columns are dropped in the exact reverse of the order ``upgrade()``
    added them, so the downgrade mirrors its counterpart step by step.
    """
    op.drop_column("document", "file_size_kb")
    op.drop_column("collection_jobs", "documents")
    op.drop_column("collection_jobs", "total_size_mb")
    op.drop_column("collection_jobs", "docs_num")
7 changes: 4 additions & 3 deletions backend/app/api/docs/collections/create.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@ pipeline:

* Create a vector store from the document IDs you received after uploading your
documents through the Documents module.
* The `batch_size` parameter controls how many documents are sent to OpenAI in a
single transaction when creating the vector store. This helps optimize the upload
process for large document sets. If not specified, the default value is **10**.
* Documents are automatically batched when creating the vector store to optimize
the upload process for large document sets. A new batch is started when either
the cumulative size of the documents in the current batch reaches 30 MB or the
batch reaches 200 files, whichever limit is hit first.
* [Deprecated] Attach the Vector Store to an OpenAI
[Assistant](https://platform.openai.com/docs/api-reference/assistants). Use
parameters in the request body relevant to an Assistant to flesh out
Expand Down
1 change: 0 additions & 1 deletion backend/app/api/routes/collection_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
CollectionActionType,
CollectionJobPublic,
)
from app.models.collection import CollectionPublic
from app.utils import APIResponse, load_description
from app.services.collections.helpers import extract_error_message, to_collection_public

Expand Down
4 changes: 4 additions & 0 deletions backend/app/api/routes/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,12 +95,16 @@ def create_collection(
if request.name:
ensure_unique_name(session, current_user.project_.id, request.name)

unique_documents = list(dict.fromkeys(request.documents))

collection_job_crud = CollectionJobCrud(session, current_user.project_.id)
collection_job = collection_job_crud.create(
CollectionJobCreate(
action_type=CollectionActionType.CREATE,
project_id=current_user.project_.id,
status=CollectionJobStatus.PENDING,
docs_num=len(unique_documents),
documents=[str(doc_id) for doc_id in unique_documents],
)
)

Expand Down
19 changes: 18 additions & 1 deletion backend/app/api/routes/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
UploadFile,
)
from fastapi import Path as FastPath
from fastapi import HTTPException

from app.api.deps import AuthContextDep, SessionDep
from app.api.permissions import Permission, require_permission
Expand All @@ -27,8 +28,9 @@
DocTransformationJobPublic,
)
from app.core.cloud import get_cloud_storage
from app.services.collections.helpers import pick_service_for_documennt
from app.services.collections.helpers import pick_service_for_documennt, MAX_DOC_SIZE_MB
from app.services.documents.helpers import (
calculate_file_size,
schedule_transformation,
pre_transform_validation,
build_document_schema,
Expand Down Expand Up @@ -129,6 +131,20 @@ async def upload_doc(
transformer=transformer,
)

file_size_kb = calculate_file_size(src)
file_size_mb = file_size_kb / 1024

if file_size_mb > MAX_DOC_SIZE_MB:
logger.warning(
f"[upload_doc] Document size exceeds limit | "
f"{{'filename': '{src.filename}', 'size_mb': {round(file_size_mb, 2)}, 'max_size_mb': {MAX_DOC_SIZE_MB}}}"
)
raise HTTPException(
status_code=413,
detail=f"Document size ({round(file_size_mb, 2)} MB) exceeds the maximum allowed size of {MAX_DOC_SIZE_MB} MB. "
f"Please upload a smaller file.",
)

storage = get_cloud_storage(session=session, project_id=current_user.project_.id)
document_id = uuid4()
object_store_url = storage.put(src, Path(str(document_id)))
Expand All @@ -137,6 +153,7 @@ async def upload_doc(
document = Document(
id=document_id,
fname=src.filename,
file_size_kb=file_size_kb,
object_store_url=str(object_store_url),
)
source_document = crud.update(document)
Expand Down
24 changes: 24 additions & 0 deletions backend/app/core/cloud/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,11 @@ def stream(self, url: str) -> StreamingBody:
"""Stream a file from storage"""
pass

@abstractmethod
def get(self, url: str) -> bytes:
    """Return the full object at ``url`` as an in-memory ``bytes`` value.

    Unlike ``stream``, this loads the entire file into memory, so it is
    only suitable for files small enough to fit in RAM.
    """
    pass

@abstractmethod
def get_file_size_kb(self, url: str) -> float:
"""Return the file size in KB"""
Expand Down Expand Up @@ -193,6 +198,25 @@ def stream(self, url: str) -> StreamingBody:
)
raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err

def get(self, url: str) -> bytes:
    """Fetch the whole S3 object at ``url`` and return its contents as bytes.

    Loads the entire object into memory, so it is only suitable for files
    that fit in RAM (use ``stream`` for larger files).

    Raises:
        CloudStorageError: wraps any ``ClientError`` from the S3 client.
    """
    # Parse the storage URL into Bucket/Key kwargs for the S3 client.
    name = SimpleStorageName.from_url(url)
    kwargs = asdict(name)
    try:
        body = self.aws.client.get_object(**kwargs).get("Body")
        # Read the full streaming body into memory.
        content = body.read()
        logger.info(
            f"[AmazonCloudStorage.get] File retrieved successfully | "
            f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'size_bytes': {len(content)}}}"
        )
        return content
    except ClientError as err:
        logger.error(
            f"[AmazonCloudStorage.get] AWS get error | "
            f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'error': '{str(err)}'}}",
            exc_info=True,
        )
        # Re-raise as the project's storage error type, preserving the cause.
        raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err

def get_file_size_kb(self, url: str) -> float:
name = SimpleStorageName.from_url(url)
kwargs = asdict(name)
Expand Down
40 changes: 9 additions & 31 deletions backend/app/crud/rag/open_ai.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import json
import logging
import functools as ft
from io import BytesIO
from typing import Iterable

from openai import OpenAI, OpenAIError
from pydantic import BaseModel

from app.core.cloud import CloudStorage
from app.core.config import settings
from app.models import Document

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -121,15 +121,13 @@ def update(
storage: CloudStorage,
documents: Iterable[Document],
):
files = []
for docs in documents:
files = []
for d in docs:
f_obj = storage.stream(d.object_store_url)

# monkey patch botocore.response.StreamingBody to make
# OpenAI happy
# Get file bytes and wrap in BytesIO for OpenAI API
content = storage.get(d.object_store_url)
f_obj = BytesIO(content)
f_obj.name = d.fname

files.append(f_obj)

logger.info(
Expand All @@ -143,31 +141,11 @@ def update(
f"[OpenAIVectorStoreCrud.update] File upload completed | {{'vector_store_id': '{vector_store_id}', 'completed_files': {req.file_counts.completed}, 'total_files': {req.file_counts.total}}}"
)
if req.file_counts.completed != req.file_counts.total:
view = {x.fname: x for x in docs}
for i in self.read(vector_store_id):
if i.last_error is None:
fname = self.client.files.retrieve(i.id)
view.pop(fname)

error = {
"error": "OpenAI document processing error",
"documents": list(view.values()),
}
try:
raise InterruptedError(json.dumps(error, cls=BaseModelEncoder))
except InterruptedError as err:
logger.error(
f"[OpenAIVectorStoreCrud.update] Document processing error | {{'vector_store_id': '{vector_store_id}', 'error': '{error['error']}', 'failed_documents': {len(error['documents'])}}}",
exc_info=True,
)
raise

while files:
f_obj = files.pop()
f_obj.close()
logger.info(
f"[OpenAIVectorStoreCrud.update] Closed file stream | {{'vector_store_id': '{vector_store_id}', 'filename': '{f_obj.name}'}}"
error_msg = f"OpenAI document processing error: {req.file_counts.completed}/{req.file_counts.total} files completed"
logger.error(
f"[OpenAIVectorStoreCrud.update] Document processing error | {{'vector_store_id': '{vector_store_id}', 'completed_files': {req.file_counts.completed}, 'total_files': {req.file_counts.total}}}"
)
raise InterruptedError(error_msg)

yield from docs

Expand Down
20 changes: 5 additions & 15 deletions backend/app/models/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from uuid import UUID, uuid4

from pydantic import HttpUrl, model_validator, model_serializer
from sqlalchemy import UniqueConstraint, Index, text
from sqlalchemy import Index, text
from sqlmodel import Field, Relationship, SQLModel

from app.core.util import now
Expand Down Expand Up @@ -39,12 +39,10 @@ class Collection(SQLModel, table=True):
description="Unique identifier for the collection",
sa_column_kwargs={"comment": "Unique identifier for the collection"},
)
provider: ProviderType = (
Field(
nullable=False,
description="LLM provider used for this collection (e.g., 'openai', 'bedrock', 'google', etc)",
sa_column_kwargs={"comment": "LLM provider used for this collection"},
),
provider: ProviderType = Field(
nullable=False,
description="LLM provider used for this collection (e.g., 'openai', 'bedrock', 'google', etc)",
sa_column_kwargs={"comment": "LLM provider used for this collection"},
)
llm_service_id: str = Field(
nullable=False,
Expand Down Expand Up @@ -102,14 +100,6 @@ class CollectionOptions(SQLModel):
documents: list[UUID] = Field(
description="List of document IDs",
)
batch_size: int = Field(
default=10,
description=(
"Number of documents to send to OpenAI in a single "
"transaction. See the `file_ids` parameter in the "
"vector store [create batch](https://platform.openai.com/docs/api-reference/vector-stores-file-batches/createBatch)."
),
)

def model_post_init(self, __context: Any):
self.documents = list(set(self.documents))
Expand Down
26 changes: 25 additions & 1 deletion backend/app/models/collection_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
from enum import Enum
from uuid import UUID, uuid4

from sqlmodel import Column, Field, SQLModel, Text
from pydantic import field_validator
from sqlmodel import JSON, Column, Field, SQLModel, Text

from app.core.util import now
from app.models.collection import CollectionIDPublic, CollectionPublic
Expand Down Expand Up @@ -53,12 +54,32 @@ class CollectionJob(SQLModel, table=True):
description="Tracing ID for correlating logs and traces.",
sa_column_kwargs={"comment": "Tracing ID for correlating logs and traces"},
)
docs_num: int | None = Field(
default=None,
description="Total number of documents to be processed in this job",
sa_column_kwargs={
"comment": "Total number of documents to be processed in this job"
},
)
total_size_mb: float | None = Field(
default=None,
description="Total size of documents being uploaded to collection in MB",
sa_column_kwargs={
"comment": "Total size of documents being uploaded to collection in MB"
},
)
error_message: str | None = Field(
default=None,
sa_column=Column(
Text, nullable=True, comment="Error message if the job failed"
),
)
documents: list[str] | None = Field(
default=None,
sa_column=Column(
JSON, nullable=True, comment="List of documents given to make collection"
),
)

# Foreign keys
collection_id: UUID | None = Field(
Expand Down Expand Up @@ -106,14 +127,17 @@ class CollectionJobCreate(SQLModel):
collection_id: UUID | None = None
status: CollectionJobStatus
action_type: CollectionActionType
docs_num: int | None = None
project_id: int
documents: list[str] | None = None


class CollectionJobUpdate(SQLModel):
task_id: str | None = None
status: CollectionJobStatus | None = None
error_message: str | None = None
collection_id: UUID | None = None
total_size_mb: float | None = None
trace_id: str | None = None


Expand Down
8 changes: 5 additions & 3 deletions backend/app/models/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ class Document(DocumentBase, table=True):
default=False,
sa_column_kwargs={"comment": "Soft delete flag"},
)
file_size_kb: float | None = Field(
default=None,
description="The size of the document in kilobytes",
sa_column_kwargs={"comment": "Size of the document in kilobytes (KB)"},
)

# Foreign keys
source_document_id: UUID | None = Field(
Expand Down Expand Up @@ -80,9 +85,6 @@ class DocumentPublic(DocumentBase):
updated_at: datetime = Field(
description="The timestamp when the document was last updated"
)
signed_url: str | None = Field(
default=None, description="A signed URL for accessing the document"
)


class TransformedDocumentPublic(DocumentPublic):
Expand Down
Loading
Loading