Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""add columns to collection job and documents table

Revision ID: 051
Revises: 050
Create Date: 2026-03-25 10:09:47.318575

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "051"
down_revision = "050"
branch_labels = None
depends_on = None


def upgrade():
    """Add size/count tracking columns to collection_jobs and document tables."""
    # New tracking columns on collection_jobs, added in their original order.
    job_columns = (
        sa.Column(
            "docs_num",
            sa.Integer(),
            nullable=True,
            comment="Total number of documents to be processed in this job",
        ),
        sa.Column(
            "total_size_mb",
            sa.Float(),
            nullable=True,
            comment="Total size of documents being uploaded to collection in MB",
        ),
        sa.Column(
            "documents",
            sa.JSON(),
            nullable=True,
            comment="List of documents given to make collection",
        ),
    )
    for column in job_columns:
        op.add_column("collection_jobs", column)

    # Per-document size tracking on the document table.
    op.add_column(
        "document",
        sa.Column(
            "file_size_kb",
            sa.Float(),
            nullable=True,
            comment="Size of the document in kilobytes (KB)",
        ),
    )


def downgrade():
    """Remove the columns added in revision 051.

    Columns are dropped in the exact reverse of the order ``upgrade()``
    added them, so the downgrade mirrors its counterpart step by step.
    """
    op.drop_column("document", "file_size_kb")
    op.drop_column("collection_jobs", "documents")
    op.drop_column("collection_jobs", "total_size_mb")
    op.drop_column("collection_jobs", "docs_num")
7 changes: 4 additions & 3 deletions backend/app/api/docs/collections/create.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@ pipeline:

* Create a vector store from the document IDs you received after uploading your
documents through the Documents module.
* The `batch_size` parameter controls how many documents are sent to OpenAI in a
single transaction when creating the vector store. This helps optimize the upload
process for large document sets. If not specified, the default value is **10**.
* Documents are automatically batched when creating the vector store to optimize
the upload process for large document sets. A new batch is started when either
the cumulative size of the documents in the current batch reaches 30 MB or the
batch reaches 200 files, whichever limit is hit first.
* [Deprecated] Attach the Vector Store to an OpenAI
[Assistant](https://platform.openai.com/docs/api-reference/assistants). Use
parameters in the request body relevant to an Assistant to flesh out
Expand Down
1 change: 0 additions & 1 deletion backend/app/api/routes/collection_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
CollectionActionType,
CollectionJobPublic,
)
from app.models.collection import CollectionPublic
from app.utils import APIResponse, load_description
from app.services.collections.helpers import extract_error_message, to_collection_public

Expand Down
4 changes: 4 additions & 0 deletions backend/app/api/routes/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,12 +95,16 @@ def create_collection(
if request.name:
ensure_unique_name(session, current_user.project_.id, request.name)

unique_documents = list(dict.fromkeys(request.documents))

collection_job_crud = CollectionJobCrud(session, current_user.project_.id)
collection_job = collection_job_crud.create(
CollectionJobCreate(
action_type=CollectionActionType.CREATE,
project_id=current_user.project_.id,
status=CollectionJobStatus.PENDING,
docs_num=len(unique_documents),
documents=[str(doc_id) for doc_id in unique_documents],
)
)

Expand Down
19 changes: 18 additions & 1 deletion backend/app/api/routes/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
UploadFile,
)
from fastapi import Path as FastPath
from fastapi import HTTPException

from app.api.deps import AuthContextDep, SessionDep
from app.api.permissions import Permission, require_permission
Expand All @@ -27,8 +28,9 @@
DocTransformationJobPublic,
)
from app.core.cloud import get_cloud_storage
from app.services.collections.helpers import pick_service_for_documennt
from app.services.collections.helpers import pick_service_for_documennt, MAX_DOC_SIZE_MB
from app.services.documents.helpers import (
calculate_file_size,
schedule_transformation,
pre_transform_validation,
build_document_schema,
Expand Down Expand Up @@ -129,6 +131,20 @@ async def upload_doc(
transformer=transformer,
)

file_size_kb = calculate_file_size(src)
file_size_mb = file_size_kb / 1024

if file_size_mb > MAX_DOC_SIZE_MB:
logger.warning(
f"[upload_doc] Document size exceeds limit | "
f"{{'filename': '{src.filename}', 'size_mb': {round(file_size_mb, 2)}, 'max_size_mb': {MAX_DOC_SIZE_MB}}}"
)
raise HTTPException(
status_code=413,
detail=f"Document size ({round(file_size_mb, 2)} MB) exceeds the maximum allowed size of {MAX_DOC_SIZE_MB} MB. "
f"Please upload a smaller file.",
)

storage = get_cloud_storage(session=session, project_id=current_user.project_.id)
document_id = uuid4()
object_store_url = storage.put(src, Path(str(document_id)))
Expand All @@ -137,6 +153,7 @@ async def upload_doc(
document = Document(
id=document_id,
fname=src.filename,
file_size_kb=file_size_kb,
object_store_url=str(object_store_url),
)
source_document = crud.update(document)
Expand Down
24 changes: 24 additions & 0 deletions backend/app/core/cloud/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,11 @@ def stream(self, url: str) -> StreamingBody:
"""Stream a file from storage"""
pass

@abstractmethod
def get(self, url: str) -> bytes:
    """Return the full object at ``url`` as an in-memory ``bytes`` value.

    Unlike ``stream``, this loads the entire file into memory, so it is
    only suitable for files small enough to fit in RAM.
    """
    pass

@abstractmethod
def get_file_size_kb(self, url: str) -> float:
"""Return the file size in KB"""
Expand Down Expand Up @@ -193,6 +198,25 @@ def stream(self, url: str) -> StreamingBody:
)
raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err

def get(self, url: str) -> bytes:
    """Fetch the whole S3 object at ``url`` and return its contents as bytes.

    Loads the entire object into memory, so it is only suitable for files
    that fit in RAM (use ``stream`` for larger files).

    Raises:
        CloudStorageError: wraps any ``ClientError`` from the S3 client.
    """
    # Parse the storage URL into Bucket/Key kwargs for the S3 client.
    name = SimpleStorageName.from_url(url)
    kwargs = asdict(name)
    try:
        body = self.aws.client.get_object(**kwargs).get("Body")
        # Read the full streaming body into memory.
        content = body.read()
        logger.info(
            f"[AmazonCloudStorage.get] File retrieved successfully | "
            f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'size_bytes': {len(content)}}}"
        )
        return content
    except ClientError as err:
        logger.error(
            f"[AmazonCloudStorage.get] AWS get error | "
            f"{{'project_id': '{self.project_id}', 'bucket': '{mask_string(name.Bucket)}', 'key': '{mask_string(name.Key)}', 'error': '{str(err)}'}}",
            exc_info=True,
        )
        # Re-raise as the project's storage error type, preserving the cause.
        raise CloudStorageError(f'AWS Error: "{err}" ({url})') from err

def get_file_size_kb(self, url: str) -> float:
name = SimpleStorageName.from_url(url)
kwargs = asdict(name)
Expand Down
40 changes: 9 additions & 31 deletions backend/app/crud/rag/open_ai.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import json
import logging
import functools as ft
from io import BytesIO
from typing import Iterable

from openai import OpenAI, OpenAIError
from pydantic import BaseModel

from app.core.cloud import CloudStorage
from app.core.config import settings
from app.models import Document

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -121,15 +121,13 @@ def update(
storage: CloudStorage,
documents: Iterable[Document],
):
files = []
for docs in documents:
files = []
for d in docs:
f_obj = storage.stream(d.object_store_url)

# monkey patch botocore.response.StreamingBody to make
# OpenAI happy
# Get file bytes and wrap in BytesIO for OpenAI API
content = storage.get(d.object_store_url)
f_obj = BytesIO(content)
f_obj.name = d.fname

files.append(f_obj)

logger.info(
Expand All @@ -143,31 +141,11 @@ def update(
f"[OpenAIVectorStoreCrud.update] File upload completed | {{'vector_store_id': '{vector_store_id}', 'completed_files': {req.file_counts.completed}, 'total_files': {req.file_counts.total}}}"
)
if req.file_counts.completed != req.file_counts.total:
view = {x.fname: x for x in docs}
for i in self.read(vector_store_id):
if i.last_error is None:
fname = self.client.files.retrieve(i.id)
view.pop(fname)

error = {
"error": "OpenAI document processing error",
"documents": list(view.values()),
}
try:
raise InterruptedError(json.dumps(error, cls=BaseModelEncoder))
except InterruptedError as err:
logger.error(
f"[OpenAIVectorStoreCrud.update] Document processing error | {{'vector_store_id': '{vector_store_id}', 'error': '{error['error']}', 'failed_documents': {len(error['documents'])}}}",
exc_info=True,
)
raise

while files:
f_obj = files.pop()
f_obj.close()
logger.info(
f"[OpenAIVectorStoreCrud.update] Closed file stream | {{'vector_store_id': '{vector_store_id}', 'filename': '{f_obj.name}'}}"
error_msg = f"OpenAI document processing error: {req.file_counts.completed}/{req.file_counts.total} files completed"
logger.error(
f"[OpenAIVectorStoreCrud.update] Document processing error | {{'vector_store_id': '{vector_store_id}', 'completed_files': {req.file_counts.completed}, 'total_files': {req.file_counts.total}}}"
)
raise InterruptedError(error_msg)

yield from docs

Expand Down
20 changes: 5 additions & 15 deletions backend/app/models/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from uuid import UUID, uuid4

from pydantic import HttpUrl, model_validator, model_serializer
from sqlalchemy import UniqueConstraint, Index, text
from sqlalchemy import Index, text
from sqlmodel import Field, Relationship, SQLModel

from app.core.util import now
Expand Down Expand Up @@ -39,12 +39,10 @@ class Collection(SQLModel, table=True):
description="Unique identifier for the collection",
sa_column_kwargs={"comment": "Unique identifier for the collection"},
)
provider: ProviderType = (
Field(
nullable=False,
description="LLM provider used for this collection (e.g., 'openai', 'bedrock', 'google', etc)",
sa_column_kwargs={"comment": "LLM provider used for this collection"},
),
provider: ProviderType = Field(
nullable=False,
description="LLM provider used for this collection (e.g., 'openai', 'bedrock', 'google', etc)",
sa_column_kwargs={"comment": "LLM provider used for this collection"},
)
llm_service_id: str = Field(
nullable=False,
Expand Down Expand Up @@ -102,14 +100,6 @@ class CollectionOptions(SQLModel):
documents: list[UUID] = Field(
description="List of document IDs",
)
batch_size: int = Field(
default=10,
description=(
"Number of documents to send to OpenAI in a single "
"transaction. See the `file_ids` parameter in the "
"vector store [create batch](https://platform.openai.com/docs/api-reference/vector-stores-file-batches/createBatch)."
),
)

def model_post_init(self, __context: Any):
self.documents = list(set(self.documents))
Expand Down
26 changes: 25 additions & 1 deletion backend/app/models/collection_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
from enum import Enum
from uuid import UUID, uuid4

from sqlmodel import Column, Field, SQLModel, Text
from pydantic import field_validator
from sqlmodel import JSON, Column, Field, SQLModel, Text

from app.core.util import now
from app.models.collection import CollectionIDPublic, CollectionPublic
Expand Down Expand Up @@ -53,12 +54,32 @@ class CollectionJob(SQLModel, table=True):
description="Tracing ID for correlating logs and traces.",
sa_column_kwargs={"comment": "Tracing ID for correlating logs and traces"},
)
docs_num: int | None = Field(
default=None,
description="Total number of documents to be processed in this job",
sa_column_kwargs={
"comment": "Total number of documents to be processed in this job"
},
)
total_size_mb: float | None = Field(
default=None,
description="Total size of documents being uploaded to collection in MB",
sa_column_kwargs={
"comment": "Total size of documents being uploaded to collection in MB"
},
)
error_message: str | None = Field(
default=None,
sa_column=Column(
Text, nullable=True, comment="Error message if the job failed"
),
)
documents: list[str] | None = Field(
default=None,
sa_column=Column(
JSON, nullable=True, comment="List of documents given to make collection"
),
)

# Foreign keys
collection_id: UUID | None = Field(
Expand Down Expand Up @@ -106,14 +127,17 @@ class CollectionJobCreate(SQLModel):
collection_id: UUID | None = None
status: CollectionJobStatus
action_type: CollectionActionType
docs_num: int | None = None
project_id: int
documents: list[str] | None = None


class CollectionJobUpdate(SQLModel):
task_id: str | None = None
status: CollectionJobStatus | None = None
error_message: str | None = None
collection_id: UUID | None = None
total_size_mb: float | None = None
trace_id: str | None = None


Expand Down
8 changes: 5 additions & 3 deletions backend/app/models/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ class Document(DocumentBase, table=True):
default=False,
sa_column_kwargs={"comment": "Soft delete flag"},
)
file_size_kb: float | None = Field(
default=None,
description="The size of the document in kilobytes",
sa_column_kwargs={"comment": "Size of the document in kilobytes (KB)"},
)

# Foreign keys
source_document_id: UUID | None = Field(
Expand Down Expand Up @@ -80,9 +85,6 @@ class DocumentPublic(DocumentBase):
updated_at: datetime = Field(
description="The timestamp when the document was last updated"
)
signed_url: str | None = Field(
default=None, description="A signed URL for accessing the document"
)


class TransformedDocumentPublic(DocumentPublic):
Expand Down
Loading
Loading