monoai.rag

RAG is a module that provides a high-level interface for performing semantic search queries against a vector database. It supports multiple vector database backends and embedding providers for flexible deployment scenarios.
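A minimal end-to-end sketch of the intended workflow, chaining the three exported classes (the file name and collection name are illustrative):

```python
from monoai.rag import RAG, ChromaVectorDB, DocumentsBuilder

# 1. Split a local text file into chunks with metadata and unique IDs
builder = DocumentsBuilder(chunk_strategy="word", chunk_size=200, chunk_overlap=20)
documents, metadatas, ids = builder.from_file("notes.txt")

# 2. Store the chunks in a persistent ChromaDB collection
db = ChromaVectorDB(name="my_documents")
db.add(documents, metadatas, ids)

# 3. Query the same collection through the high-level RAG interface
rag = RAG(database="my_documents")
results = rag.query("What is machine learning?", k=5)
print(results["documents"][0])
```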

 1"""
 2RAG is a module that provides a high-level interface for performing semantic search queries
 3against a vector database. It supports multiple vector database backends and
 4embedding providers for flexible deployment scenarios.
 5"""
 6
 7from .rag import RAG
 8from .vectordb import ChromaVectorDB
 9from .documents_builder import DocumentsBuilder
10
11__all__ = ['RAG', 'ChromaVectorDB', 'DocumentsBuilder'] 
class RAG:
  7class RAG:
  8    """
  9    Retrieval-Augmented Generation (RAG) system for semantic search and document retrieval.
 10    
 11    This class provides a high-level interface for performing semantic search queries
 12    against a vector database. It supports multiple vector database backends and
 13    embedding providers for flexible deployment scenarios.
 14    
 15    The RAG system works by:
 16    1. Converting text queries into vector embeddings
 17    2. Searching the vector database for similar document embeddings
 18    3. Returning the most relevant documents based on semantic similarity
 19    
 20    Attributes:
 21        _vectorizer (str): The embedding model used for vectorization
 22        _db (str): Name of the vector database
 23        _vector_db (ChromaVectorDB): The vector database backend
 24    
 25    Examples:
 26    --------
 27     Basic usage with default settings:
 28        
 29    ```python
 30    # Initialize RAG with a database name
 31    rag = RAG(database="my_documents")
 32        
 33    # Perform a semantic search
 34    results = rag.query("What is machine learning?", k=5)
 35    ```
 36        
 37    Using with specific embedding provider:
 38        
 39    ```python
 40    # Initialize with OpenAI embeddings
 41    rag = RAG(
 42        database="my_documents",
 43        provider="openai",
 44        vectorizer="text-embedding-ada-002"
 45    )
 46        
 47    # Search for relevant documents
 48    results = rag.query("Explain neural networks", k=10)
 49    ```
 50        
 51    Working with different vector databases:
 52        
 53    ```python
 54    # Currently supports ChromaDB
 55    rag = RAG(
 56        database="my_collection",
 57        vector_db="chroma",
 58        provider="openai",
 59        vectorizer="text-embedding-ada-002"
 60    )
 61    ```
 62
 63    Attach a RAG instance to a model so that the model can automatically use it to answer questions:
 64    ```python
 65    model = Model(provider="openai", model="gpt-4o-mini")
 66    model._add_rag(RAG(database="my_documents", vector_db="chroma"))
 67    ```
 68
 69    """
 70
 71    def __init__(self, 
 72                database: str,
 73                 provider: Optional[str] = None,
 74                 vectorizer: Optional[str] = None, 
 75                 vector_db: str = "chroma"):
 76        """
 77        Initialize the RAG system.
 78        
 79        Parameters:
 80        -----------
 81        database : str
 82            Name of the vector database/collection to use for storage and retrieval.
 83            This will be created if it doesn't exist.
 84            
 85        provider : str, optional
 86            The embedding provider to use (e.g., "openai", "anthropic", "cohere").
 87            If provided, the corresponding API key will be loaded automatically.
 88            If None, the system will use default embedding settings.
 89            
 90        vectorizer : str, optional
 91            The specific embedding model to use for vectorization.
 92            Examples: "text-embedding-ada-002", "text-embedding-3-small", "embed-english-v3.0"
 93            If None, the provider's default model will be used.
 94            
 95        vector_db : str, default="chroma"
 96            The vector database backend to use. Currently supports:
 97            - "chroma": ChromaDB (default, recommended for most use cases)
 98            
 99        Raises:
100        -------
101        ValueError
102            If an unsupported vector database is specified.
103            
104        Examples:
105        ---------
106        ```python
107        # Minimal initialization
108        rag = RAG("my_documents")
109        
110        # With OpenAI embeddings
111        rag = RAG(
112            database="research_papers",
113            provider="openai",
114            vectorizer="text-embedding-ada-002"
115        )
116        
 117        # With Cohere embeddings
 118        rag = RAG(
 119            database="articles",
 120            provider="cohere",
 121            vectorizer="embed-english-v3.0"
 122        )
123        ```
124        """
125        if provider:
126            load_key(provider)
127            
128        self._vectorizer = vectorizer
129        self._db = database
130        
131        if vector_db == "chroma":
132            self._vector_db = ChromaVectorDB(
133                name=database, 
134                vectorizer_provider=provider, 
135                vectorizer_model=vectorizer
136            )
137        else:
138            raise ValueError(f"Vector database '{vector_db}' not supported. Currently only 'chroma' is supported.")
139        
140
141    def query(self, query: str, k: int = 10) -> Dict[str, Any]:
142        """
143        Perform a semantic search query against the vector database.
144        
145        This method converts the input query into a vector embedding and searches
146        the database for the most semantically similar documents.
147        
148        Parameters:
149        -----------
150        query : str
151            The text query to search for. This will be converted to a vector
152            embedding and used to find similar documents.
153            
154        k : int, default=10
155            The number of most relevant documents to return. Higher values
156            return more results but may include less relevant documents.
157            
158        Returns:
159        --------
160        Dict[str, Any]
161            A dictionary containing the search results with the following structure:
162            {
163                'ids': List[List[str]] - Document IDs of the retrieved documents,
164                'documents': List[List[str]] - The actual document content,
165                'metadatas': List[List[Dict]] - Metadata for each document,
166                'distances': List[List[float]] - Similarity scores (lower = more similar)
167            }
168            
169        Examples:
170        ---------
171        ```python
172        # Basic query
173        results = rag.query("What is artificial intelligence?")
174        
175        # Query with more results
176        results = rag.query("Machine learning algorithms", k=20)
177        
178        # Accessing results
179        for i, (doc_id, document, metadata, distance) in enumerate(zip(
180            results['ids'][0], 
181            results['documents'][0], 
182            results['metadatas'][0], 
183            results['distances'][0]
184        )):
185            print(f"Result {i+1}:")
186            print(f"  ID: {doc_id}")
187            print(f"  Content: {document[:100]}...")
188            print(f"  Similarity: {1 - distance:.3f}")
189            print(f"  Metadata: {metadata}")
190            print()
191        ```
192        
193        Notes:
194        ------
195        - The query is automatically embedded using the same embedding model as the stored documents
196        - Results are returned in order of relevance (most similar first)
197        - Distance scores are cosine distances (0 = identical, 2 = completely opposite)
198        - If fewer than k documents exist in the database, all available documents are returned
199        """
200        return self._vector_db.query(query, k)

Retrieval-Augmented Generation (RAG) system for semantic search and document retrieval.

This class provides a high-level interface for performing semantic search queries against a vector database. It supports multiple vector database backends and embedding providers for flexible deployment scenarios.

The RAG system works by:

  1. Converting text queries into vector embeddings
  2. Searching the vector database for similar document embeddings
  3. Returning the most relevant documents based on semantic similarity

Attributes:

  • _vectorizer (str): The embedding model used for vectorization
  • _db (str): Name of the vector database
  • _vector_db (ChromaVectorDB): The vector database backend

Examples:

Basic usage with default settings:

# Initialize RAG with a database name
rag = RAG(database="my_documents")

# Perform a semantic search
results = rag.query("What is machine learning?", k=5)

Using with specific embedding provider:

# Initialize with OpenAI embeddings
rag = RAG(
    database="my_documents",
    provider="openai",
    vectorizer="text-embedding-ada-002"
)

# Search for relevant documents
results = rag.query("Explain neural networks", k=10)

Working with different vector databases:

# Currently supports ChromaDB
rag = RAG(
    database="my_collection",
    vector_db="chroma",
    provider="openai",
    vectorizer="text-embedding-ada-002"
)

Attach a RAG instance to a model so that the model can automatically use it to answer questions:

model = Model(provider="openai", model="gpt-4o-mini")
model._add_rag(RAG(database="my_documents", vector_db="chroma"))
RAG( database: str, provider: Optional[str] = None, vectorizer: Optional[str] = None, vector_db: str = 'chroma')

Initialize the RAG system.

Parameters:

database : str
    Name of the vector database/collection to use for storage and retrieval. This will be created if it doesn't exist.

provider : str, optional
    The embedding provider to use (e.g., "openai", "anthropic", "cohere"). If provided, the corresponding API key will be loaded automatically. If None, the system will use default embedding settings.

vectorizer : str, optional
    The specific embedding model to use for vectorization. Examples: "text-embedding-ada-002", "text-embedding-3-small", "embed-english-v3.0". If None, the provider's default model will be used.

vector_db : str, default="chroma"
    The vector database backend to use. Currently supports:
      • "chroma": ChromaDB (default, recommended for most use cases)

Raises:

ValueError
    If an unsupported vector database is specified.

Examples:

# Minimal initialization
rag = RAG("my_documents")

# With OpenAI embeddings
rag = RAG(
    database="research_papers",
    provider="openai",
    vectorizer="text-embedding-ada-002"
)

# With Cohere embeddings
rag = RAG(
    database="articles",
    provider="cohere",
    vectorizer="embed-english-v3.0"
)
def query(self, query: str, k: int = 10) -> Dict[str, Any]:

Perform a semantic search query against the vector database.

This method converts the input query into a vector embedding and searches the database for the most semantically similar documents.

Parameters:

query : str
    The text query to search for. This will be converted to a vector embedding and used to find similar documents.

k : int, default=10
    The number of most relevant documents to return. Higher values return more results but may include less relevant documents.

Returns:

Dict[str, Any]
    A dictionary containing the search results with the following structure:

    {
        'ids': List[List[str]] - Document IDs of the retrieved documents,
        'documents': List[List[str]] - The actual document content,
        'metadatas': List[List[Dict]] - Metadata for each document,
        'distances': List[List[float]] - Similarity scores (lower = more similar)
    }

Examples:

# Basic query
results = rag.query("What is artificial intelligence?")

# Query with more results
results = rag.query("Machine learning algorithms", k=20)

# Accessing results
for i, (doc_id, document, metadata, distance) in enumerate(zip(
    results['ids'][0], 
    results['documents'][0], 
    results['metadatas'][0], 
    results['distances'][0]
)):
    print(f"Result {i+1}:")
    print(f"  ID: {doc_id}")
    print(f"  Content: {document[:100]}...")
    print(f"  Similarity: {1 - distance:.3f}")
    print(f"  Metadata: {metadata}")
    print()

Notes:

  • The query is automatically embedded using the same embedding model as the stored documents
  • Results are returned in order of relevance (most similar first)
  • Distance scores are cosine distances (0 = identical, 2 = completely opposite)
  • If fewer than k documents exist in the database, all available documents are returned
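The distance-to-similarity relationship described in the notes above can be used to post-filter results; a minimal sketch (the 0.75 threshold is an arbitrary illustration):

```python
results = rag.query("What is machine learning?", k=10)

# Keep only chunks whose similarity (1 - distance) clears the threshold
threshold = 0.75
relevant = [
    (doc_id, doc)
    for doc_id, doc, dist in zip(
        results["ids"][0], results["documents"][0], results["distances"][0]
    )
    if 1 - dist >= threshold
]
```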
class ChromaVectorDB(monoai.rag.vectordb._BaseVectorDB):
172class ChromaVectorDB(_BaseVectorDB):
173    """
174    ChromaDB implementation of the vector database interface.
175    
176    This class provides a concrete implementation of the vector database
177    using ChromaDB as the backend. ChromaDB is an open-source embedding
178    database that supports persistent storage and efficient similarity search.
179    
180    Features:
181    - Persistent storage of document embeddings
182    - Efficient similarity search with configurable result count
183    - Metadata storage for each document
184    - Automatic collection creation if it doesn't exist
185    - Support for custom embedding models via LiteLLM
186    
187    Attributes:
188        _client (chromadb.PersistentClient): ChromaDB client instance
189        _collection (chromadb.Collection): Active collection for operations
190    
191    Examples:
192    --------
193    Basic usage:
194    
195    ```python
196    # Initialize with a new collection
197    vector_db = ChromaVectorDB(name="my_documents")
198    
199    # Add documents
200    documents = ["Document 1 content", "Document 2 content"]
201    metadatas = [{"source": "file1.txt"}, {"source": "file2.txt"}]
202    ids = ["doc1", "doc2"]
203    
204    vector_db.add(documents, metadatas, ids)
205    
206    # Search for similar documents
207    results = vector_db.query("search query", k=5)
208    ```
209    
210    Using with specific embedding model:
211    
212    ```python
213    # Initialize with OpenAI embeddings
214    vector_db = ChromaVectorDB(
215        name="research_papers",
216        vectorizer_provider="openai",
217        vectorizer_model="text-embedding-ada-002"
218    )
219    ```
220    """
221
222    def __init__(self, name: Optional[str] = None, 
223                 vectorizer_provider: Optional[str] = None, 
224                 vectorizer_model: Optional[str] = None):
225        """
226        Initialize the ChromaDB vector database.
227        
228        Parameters:
229        -----------
230        name : str, optional
231            Name of the ChromaDB collection. If provided, the collection
232            will be created if it doesn't exist, or connected to if it does.
233            
234        vectorizer_provider : str, optional
235            The embedding provider to use for vectorization.
236            Examples: "openai", "anthropic", "cohere"
237            
238        vectorizer_model : str, optional
239            The specific embedding model to use.
240            Examples: "text-embedding-ada-002", "text-embedding-3-small"
241            
242        Examples:
243        ---------
244        ```python
245        # Create new collection
246        vector_db = ChromaVectorDB("my_documents")
247        
248        # Connect to existing collection with custom embeddings
249        vector_db = ChromaVectorDB(
250            name="existing_collection",
251            vectorizer_provider="openai",
252            vectorizer_model="text-embedding-ada-002"
253        )
254        ```
255        """
256        super().__init__(name, vectorizer_provider, vectorizer_model)
257        try:
258            import chromadb
259        except ImportError:
260            raise ImportError("chromadb is not installed. Please install it with 'pip install chromadb'")
261
262        self._client = chromadb.PersistentClient()
263        if name:
264            try:
265                self._collection = self._client.get_collection(name)
266            except chromadb.errors.NotFoundError:
267                self._collection = self._client.create_collection(name)
268
269    def add(self, documents: List[str], metadatas: List[Dict], ids: List[str]) -> None:
270        """
271        Add documents to the ChromaDB collection.
272        
273        This method adds documents along with their metadata and IDs to the
274        ChromaDB collection. The documents are automatically converted to
275        embeddings using the configured embedding model.
276        
277        Parameters:
278        -----------
279        documents : List[str]
280            List of text documents to add to the database.
281            Each document will be converted to a vector embedding.
282            
283        metadatas : List[Dict]
284            List of metadata dictionaries for each document.
285            Each metadata dict can contain any key-value pairs for
286            document categorization and filtering.
287            
288        ids : List[str]
289            List of unique identifiers for each document.
290            IDs must be unique within the collection.
291            
292        Raises:
293        -------
294        ValueError
295            If the lengths of documents, metadatas, and ids don't match.
296            
297        Examples:
298        ---------
299        ```python
300        # Add documents with metadata
301        documents = [
302            "Machine learning is a subset of artificial intelligence.",
303            "Deep learning uses neural networks with multiple layers."
304        ]
305        
306        metadatas = [
307            {"topic": "machine_learning", "source": "textbook", "year": 2023},
308            {"topic": "deep_learning", "source": "research_paper", "year": 2023}
309        ]
310        
311        ids = ["doc_001", "doc_002"]
312        
313        vector_db.add(documents, metadatas, ids)
314        ```
315        
316        Notes:
317        ------
318        - All three lists must have the same length
319        - IDs must be unique within the collection
320        - Documents are automatically embedded using the configured model
321        - Metadata can be used for filtering during queries
322        """
323        if not (len(documents) == len(metadatas) == len(ids)):
324            raise ValueError("documents, metadatas, and ids must have the same length")
325            
326        self._collection.add(
327            documents=documents,
328            metadatas=metadatas,
329            ids=ids
330        )
331
332    def query(self, query: str, k: int = 10) -> Dict[str, Any]:
333        """
334        Search for similar documents in the ChromaDB collection.
335        
336        This method performs semantic search by converting the query to an
337        embedding and finding the most similar document embeddings in the
338        collection.
339        
340        Parameters:
341        -----------
342        query : str
343            The text query to search for. This will be converted to a
344            vector embedding and compared against stored documents.
345            
346        k : int, default=10
347            Number of most similar documents to return. Higher values
348            return more results but may include less relevant documents.
349            
350        Returns:
351        --------
352        Dict[str, Any]
353            A dictionary containing search results with the following structure:
354            {
355                'ids': List[List[str]] - Document IDs of retrieved documents,
356                'documents': List[List[str]] - The actual document content,
357                'metadatas': List[List[Dict]] - Metadata for each document,
358                'distances': List[List[float]] - Similarity scores (lower = more similar)
359            }
360            
361        Examples:
362        ---------
363        ```python
364        # Basic search
365        results = vector_db.query("What is machine learning?", k=5)
366        
367        # Access results
368        for i, (doc_id, document, metadata, distance) in enumerate(zip(
369            results['ids'][0], 
370            results['documents'][0], 
371            results['metadatas'][0], 
372            results['distances'][0]
373        )):
374            print(f"Result {i+1}:")
375            print(f"  ID: {doc_id}")
376            print(f"  Content: {document[:100]}...")
377            print(f"  Similarity: {1 - distance:.3f}")
378            print(f"  Metadata: {metadata}")
379        ```
380        
381        Notes:
382        ------
383        - Results are returned in order of similarity (most similar first)
384        - Distance scores are cosine distances (0 = identical, 2 = opposite)
385        - If fewer than k documents exist, all available documents are returned
386        - The query is automatically embedded using the same model as stored documents
387        """
388        results = self._collection.query(
389            query_texts=query,
390            n_results=k
391        )
392        return results

ChromaDB implementation of the vector database interface.

This class provides a concrete implementation of the vector database using ChromaDB as the backend. ChromaDB is an open-source embedding database that supports persistent storage and efficient similarity search.

Features:

  • Persistent storage of document embeddings
  • Efficient similarity search with configurable result count
  • Metadata storage for each document
  • Automatic collection creation if it doesn't exist
  • Support for custom embedding models via LiteLLM

Attributes:

  • _client (chromadb.PersistentClient): ChromaDB client instance
  • _collection (chromadb.Collection): Active collection for operations

Examples:

Basic usage:

# Initialize with a new collection
vector_db = ChromaVectorDB(name="my_documents")

# Add documents
documents = ["Document 1 content", "Document 2 content"]
metadatas = [{"source": "file1.txt"}, {"source": "file2.txt"}]
ids = ["doc1", "doc2"]

vector_db.add(documents, metadatas, ids)

# Search for similar documents
results = vector_db.query("search query", k=5)

Using with specific embedding model:

# Initialize with OpenAI embeddings
vector_db = ChromaVectorDB(
    name="research_papers",
    vectorizer_provider="openai",
    vectorizer_model="text-embedding-ada-002"
)
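
Because DocumentsBuilder returns documents, metadatas and ids in exactly the shape that add() expects, its output can be passed straight into the collection; a minimal sketch (the PDF path is illustrative):

```python
builder = DocumentsBuilder(chunk_size=500, chunk_overlap=50)
docs, metas, ids = builder.from_pdf("paper.pdf")

db = ChromaVectorDB(name="research_papers")
db.add(docs, metas, ids)
```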
ChromaVectorDB( name: Optional[str] = None, vectorizer_provider: Optional[str] = None, vectorizer_model: Optional[str] = None)

Initialize the ChromaDB vector database.

Parameters:

name : str, optional
    Name of the ChromaDB collection. If provided, the collection will be created if it doesn't exist, or connected to if it does.

vectorizer_provider : str, optional
    The embedding provider to use for vectorization. Examples: "openai", "anthropic", "cohere"

vectorizer_model : str, optional
    The specific embedding model to use. Examples: "text-embedding-ada-002", "text-embedding-3-small"

Examples:

# Create new collection
vector_db = ChromaVectorDB("my_documents")

# Connect to existing collection with custom embeddings
vector_db = ChromaVectorDB(
    name="existing_collection",
    vectorizer_provider="openai",
    vectorizer_model="text-embedding-ada-002"
)
def add( self, documents: List[str], metadatas: List[Dict], ids: List[str]) -> None:

Add documents to the ChromaDB collection.

This method adds documents along with their metadata and IDs to the ChromaDB collection. The documents are automatically converted to embeddings using the configured embedding model.

Parameters:

documents : List[str]
    List of text documents to add to the database. Each document will be converted to a vector embedding.

metadatas : List[Dict]
    List of metadata dictionaries for each document. Each metadata dict can contain any key-value pairs for document categorization and filtering.

ids : List[str]
    List of unique identifiers for each document. IDs must be unique within the collection.

Raises:

ValueError
    If the lengths of documents, metadatas, and ids don't match.

Examples:

# Add documents with metadata
documents = [
    "Machine learning is a subset of artificial intelligence.",
    "Deep learning uses neural networks with multiple layers."
]

metadatas = [
    {"topic": "machine_learning", "source": "textbook", "year": 2023},
    {"topic": "deep_learning", "source": "research_paper", "year": 2023}
]

ids = ["doc_001", "doc_002"]

vector_db.add(documents, metadatas, ids)

Notes:

  • All three lists must have the same length
  • IDs must be unique within the collection
  • Documents are automatically embedded using the configured model
  • Metadata can be used for filtering during queries
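
Since IDs must be unique within the collection, a simple way to guarantee this (and what DocumentsBuilder itself does internally) is to generate UUIDs:

```python
import uuid

documents = ["First chunk of text.", "Second chunk of text."]
metadatas = [{"source": "example"}, {"source": "example"}]
ids = [str(uuid.uuid4()) for _ in documents]  # one unique ID per document

vector_db.add(documents, metadatas, ids)
```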
def query(self, query: str, k: int = 10) -> Dict[str, Any]:

Search for similar documents in the ChromaDB collection.

This method performs semantic search by converting the query to an embedding and finding the most similar document embeddings in the collection.

Parameters:

query : str
    The text query to search for. This will be converted to a vector embedding and compared against stored documents.

k : int, default=10
    Number of most similar documents to return. Higher values return more results but may include less relevant documents.

Returns:

Dict[str, Any]
    A dictionary containing search results with the following structure:

    {
        'ids': List[List[str]] - Document IDs of retrieved documents,
        'documents': List[List[str]] - The actual document content,
        'metadatas': List[List[Dict]] - Metadata for each document,
        'distances': List[List[float]] - Similarity scores (lower = more similar)
    }

Examples:

# Basic search
results = vector_db.query("What is machine learning?", k=5)

# Access results
for i, (doc_id, document, metadata, distance) in enumerate(zip(
    results['ids'][0], 
    results['documents'][0], 
    results['metadatas'][0], 
    results['distances'][0]
)):
    print(f"Result {i+1}:")
    print(f"  ID: {doc_id}")
    print(f"  Content: {document[:100]}...")
    print(f"  Similarity: {1 - distance:.3f}")
    print(f"  Metadata: {metadata}")

Notes:

  • Results are returned in order of similarity (most similar first)
  • Distance scores are cosine distances (0 = identical, 2 = opposite)
  • If fewer than k documents exist, all available documents are returned
  • The query is automatically embedded using the same model as stored documents
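
Retrieved chunks are typically concatenated into a context block for a downstream language model; a minimal, framework-agnostic sketch of that step:

```python
results = vector_db.query("What is machine learning?", k=3)

# Join the top chunks into a single context string for prompting
context = "\n\n".join(results["documents"][0])
prompt = (
    "Answer the question using only the context below.\n\n"
    f"Context:\n{context}\n\n"
    "Question: What is machine learning?"
)
```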
class DocumentsBuilder:
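DocumentsBuilder turns raw sources (files, strings, URLs, Word documents, PDFs) into (documents, metadatas, ids) triples ready for a vector database. A minimal sketch of the built-in sentence strategy and a custom splitter (the splitting logic shown is purely illustrative):

```python
# Sentence-based chunking: here chunk_size counts sentences per chunk
builder = DocumentsBuilder(chunk_strategy="sentence", chunk_size=5, chunk_overlap=1)
docs, metas, ids = builder.from_str(
    "First sentence. Second sentence. Third sentence.", source_name="notes"
)

# A custom splitter receives (text, chunk_size, chunk_overlap) and returns chunks;
# passing custom_split_func automatically switches the strategy to "custom"
def split_on_blank_lines(text, chunk_size, chunk_overlap):
    return [part.strip() for part in text.split("\n\n") if part.strip()]

custom_builder = DocumentsBuilder(custom_split_func=split_on_blank_lines)
chunks, metadatas, ids = custom_builder.from_str("Part one.\n\nPart two.")
```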
 14class DocumentsBuilder:
 15    """
 16    A utility class for building document collections from various sources.
 17    
 18    This class provides methods to extract text content from files and web pages,
 19    split the content into manageable chunks with configurable size and overlap,
 20    and prepare the data for storage in vector databases.
 21    
 22    The DocumentsBuilder is designed to work seamlessly with the RAG system,
 23    producing output that can be directly used with vector database operations.
 24    
 25    Features:
 26    - File-based document extraction with UTF-8 encoding support
 27    - Text string processing for in-memory content
 28    - Web scraping with multiple engine options (requests, tavily, selenium)
 29    - Word document extraction (.doc and .docx formats)
 30    - PDF document extraction with metadata
 31    - Multiple chunking strategies (word, sentence, paragraph, fixed, semantic)
 32    - Configurable chunk size and overlap parameters
 33    - Rich metadata generation for each document chunk
 34    - Unique ID generation for database storage
 35    
 36    Attributes:
 37        _chunk_strategy (str): The chunking strategy to use
 38        _chunk_size (int): Maximum size of each chunk, in the units of the chunking strategy (e.g., characters for "fixed", words for "word")
 39        _chunk_overlap (int): Overlap between consecutive chunks, in the same units
 40    """
 41
 42    def __init__(
 43        self,
 44        chunk_strategy: str = "word",
 45        chunk_size: int = 1000,
 46        chunk_overlap: int = 0,
 47        custom_split_func: Optional[callable] = None
 48    ):
 49        """
 50        Initialize the DocumentsBuilder with chunking parameters.
 51        """
 52        # If custom_split_func is provided, automatically set strategy to "custom"
 53        if custom_split_func is not None:
 54            chunk_strategy = "custom"
 55        
 56        self._chunk_strategy = chunk_strategy
 57        self._chunk_size = chunk_size
 58        self._chunk_overlap = chunk_overlap
 59        self._custom_split_func = custom_split_func
 60
 61        self._module_cache: Dict[str, object] = {}
 62        
 63        if chunk_overlap >= chunk_size:
 64            raise ValueError(
 65                f"chunk_overlap ({chunk_overlap}) must be less than chunk_size ({chunk_size}) "
 66                "to prevent infinite loops. Recommended: chunk_overlap should be 10-20% of chunk_size."
 67            )
 68        
 69        if chunk_size <= 0:
 70            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
 71        
 72        if chunk_overlap < 0:
 73            raise ValueError(f"chunk_overlap must be non-negative, got {chunk_overlap}")
 74        
 75        if chunk_strategy == "custom" and custom_split_func is None:
 76            raise ValueError("custom_split_func must be provided when chunk_strategy='custom'")
 77        
 78        if custom_split_func is not None and not callable(custom_split_func):
 79            raise ValueError("custom_split_func must be callable")
 80
 81    # ---------- Lazy import helpers ----------
 82    @staticmethod
 83    def _has_module(name: str) -> bool:
 84        """Check whether a module is installed without fully importing it."""
 85        spec = importlib.util.find_spec(name)
 86        return spec is not None
 87
 88    def _get_module(self, name: str):
 89        """Lazily import a module and cache it."""
 90        mod = self._module_cache.get(name)
 91        if mod is None:
 92            try:
 93                mod = importlib.import_module(name)
 94            except ImportError as e:
 95                raise ImportError(
 96                    f"The module '{name}' is required for this operation. "
 97                    f"Install it with: pip install {name}"
 98                ) from e
 99            self._module_cache[name] = mod
100        return mod
101
102    def from_file(self, file_path: str) -> Tuple[List[str], List[Dict], List[str]]:
103        """
104        Read a file and split it into chunks with specified size and overlap.
105        """
106        if not os.path.exists(file_path):
107            raise FileNotFoundError(f"File not found: {file_path}")
108        
109        with open(file_path, 'r', encoding='utf-8') as file:
110            text = file.read()
111        
112        chunks = self._split_text(text)
113        
114        documents = []
115        metadatas = []
116        ids = []
117        
118        for i, chunk in enumerate(chunks):
119            chunk_id = str(uuid.uuid4())
120            metadata = {
121                'file_path': file_path,
122                'file_name': os.path.basename(file_path),
123                'chunk_index': i,
124                'total_chunks': len(chunks),
125                'chunk_size': len(chunk)
126            }
127            documents.append(chunk)
128            metadatas.append(metadata)
129            ids.append(chunk_id)
130        
131        return documents, metadatas, ids
132
133    def from_str(self, text: str, source_name: str = "text_string") -> Tuple[List[str], List[Dict], List[str]]:
134        """
135        Process a text string and split it into chunks with specified size and overlap.
136        """
137        if not text or not text.strip():
138            return [], [], []
139        
140        chunks = self._split_text(text)
141        
142        documents = []
143        metadatas = []
144        ids = []
145        
146        for i, chunk in enumerate(chunks):
147            chunk_id = str(uuid.uuid4())
148            metadata = {
149                'source_type': 'text_string',
150                'source_name': source_name,
151                'chunk_index': i,
152                'total_chunks': len(chunks),
153                'chunk_size': len(chunk),
154                'chunk_strategy': self._chunk_strategy
155            }
156            documents.append(chunk)
157            metadatas.append(metadata)
158            ids.append(chunk_id)
159        
160        return documents, metadatas, ids
161
162    def from_doc(self, file_path: str, extraction_method: str = "auto") -> Tuple[List[str], List[Dict], List[str]]:
163        """
164        Extract text from Word documents (.doc and .docx files) and split into chunks.
165        """
166        if not os.path.exists(file_path):
167            raise FileNotFoundError(f"File not found: {file_path}")
168        
169        file_extension = os.path.splitext(file_path)[1].lower()
170        if file_extension not in ['.doc', '.docx']:
171            raise ValueError(f"Unsupported file format: {file_extension}. Only .doc and .docx files are supported.")
172        
173        # Choose the extraction method on the fly, using lazy dependency detection
174        if extraction_method == "auto":
175            if file_extension == '.docx' and self._has_module("docx"):
176                extraction_method = "docx"
177            elif self._has_module("docx2txt"):
178                extraction_method = "docx2txt"
179            else:
180                raise ImportError(
181                    "docx2txt is required for Word document extraction. Install with: pip install docx2txt"
182                )
183        
184        if extraction_method == "docx":
185            if file_extension != '.docx':
186                raise ValueError("'docx' extraction method only supports .docx files")
187            text, doc_properties = self._extract_with_docx(file_path)
188        elif extraction_method == "docx2txt":
189            text, doc_properties = self._extract_with_docx2txt(file_path)
190        else:
191            raise ValueError(f"Unsupported extraction method: {extraction_method}")
192        
193        chunks = self._split_text(text)
194        
195        documents = []
196        metadatas = []
197        ids = []
198        
199        for i, chunk in enumerate(chunks):
200            chunk_id = str(uuid.uuid4())
201            metadata = {
202                'file_path': file_path,
203                'file_name': os.path.basename(file_path),
204                'document_format': file_extension[1:],  # Remove the dot
205                'extraction_method': extraction_method,
206                'chunk_index': i,
207                'total_chunks': len(chunks),
208                'chunk_size': len(chunk)
209            }
210            if doc_properties:
211                metadata.update(doc_properties)
212            
213            documents.append(chunk)
214            metadatas.append(metadata)
215            ids.append(chunk_id)
216        
217        return documents, metadatas, ids
218
219    def from_pdf(self, file_path: str, page_range: Optional[Tuple[int, int]] = None) -> Tuple[List[str], List[Dict], List[str]]:
220        """
221        Extract text from PDF documents and split into chunks.
222        """
223        if not os.path.exists(file_path):
224            raise FileNotFoundError(f"File not found: {file_path}")
225        
226        file_extension = os.path.splitext(file_path)[1].lower()
227        if file_extension != '.pdf':
228            raise ValueError(f"Unsupported file format: {file_extension}. Only .pdf files are supported.")
229        
230        # PyPDF2 is imported here only if it is actually needed
231        if not self._has_module("PyPDF2"):
232            raise ImportError("PyPDF2 is required for PDF file extraction. Install with: pip install PyPDF2")
233        
234        text, pdf_properties, page_info = self._extract_from_pdf(file_path, page_range)
235        
236        chunks = self._split_text(text)
237        
238        documents = []
239        metadatas = []
240        ids = []
241        
242        for i, chunk in enumerate(chunks):
243            chunk_id = str(uuid.uuid4())
244            metadata = {
245                'file_path': file_path,
246                'file_name': os.path.basename(file_path),
247                'document_format': 'pdf',
248                'chunk_index': i,
249                'total_chunks': len(chunks),
250                'chunk_size': len(chunk)
251            }
252            if pdf_properties:
253                metadata.update(pdf_properties)
254            if page_info:
255                metadata.update(page_info)
256            
257            documents.append(chunk)
258            metadatas.append(metadata)
259            ids.append(chunk_id)
260        
261        return documents, metadatas, ids
262
263    def from_url(self, url: str, engine: str = "requests", deep: bool = False) -> Tuple[List[str], List[Dict], List[str]]:
264        """
265        Scrape content from a URL and split it into chunks with specified size and overlap.
266        
267        Notes:
268        - Scraping may take time depending on the engine and website complexity
269        - Some websites may block automated scraping
270        - Selenium requires Chrome/Chromium to be installed
271        - Tavily requires an API key to be configured
272        """
273        from monoai.tools.webscraping import scrape_web
274        result = scrape_web(url, engine=engine, deep=deep)
275                
276        if not result or not result.get("text"):
277            raise ValueError(f"Failed to extract text content from URL: {url}")
278        
279        text = result["text"]
280        chunks = self._split_text(text)
281        
282        documents = []
283        metadatas = []
284        ids = []
285        
286        for i, chunk in enumerate(chunks):
287            chunk_id = str(uuid.uuid4())
288            metadata = {
289                'url': url,
290                'source_type': 'web_page',
291                'scraping_engine': engine,
292                'deep_extraction': deep,
293                'chunk_index': i,
294                'total_chunks': len(chunks),
295                'chunk_size': len(chunk)
296            }
297            documents.append(chunk)
298            metadatas.append(metadata)
299            ids.append(chunk_id)
300        
301        return documents, metadatas, ids
302    
303    def _extract_with_docx(self, file_path: str) -> Tuple[str, Dict]:
304        """
305        Extract text from a .docx file using python-docx library (lazy import).
306        """
307        docx = self._get_module("docx")
308        Document = docx.Document 
309        doc = Document(file_path)
310        
311        # Extract text from paragraphs
312        text_parts = []
313        for paragraph in doc.paragraphs:
314            if paragraph.text.strip():
315                text_parts.append(paragraph.text)
316        
317        # Extract text from tables
318        for table in doc.tables:
319            for row in table.rows:
320                row_text = []
321                for cell in row.cells:
322                    if cell.text.strip():
323                        row_text.append(cell.text.strip())
324                if row_text:
325                    text_parts.append(" | ".join(row_text))
326        
327        text = "\n\n".join(text_parts)
328        
329        # Extract document properties
330        properties = {}
331        core_props = doc.core_properties
332        if core_props.title:
333            properties['document_title'] = core_props.title
334        if core_props.author:
335            properties['document_author'] = core_props.author
336        if core_props.subject:
337            properties['document_subject'] = core_props.subject
338        if core_props.created:
339            properties['document_created'] = str(core_props.created)
340        if core_props.modified:
341            properties['document_modified'] = str(core_props.modified)
342        
343        return text, properties
344    
345    def _extract_with_docx2txt(self, file_path: str) -> Tuple[str, Dict]:
346        """
347        Extract text from a Word document using docx2txt library (lazy import).
348        """
349        docx2txt = self._get_module("docx2txt")
350        text = docx2txt.process(file_path)  
351        return text, {}
352    
353    def _extract_from_pdf(self, file_path: str, page_range: Optional[Tuple[int, int]] = None) -> Tuple[str, Dict, Dict]:
354        """
355        Extract text and metadata from a PDF file using PyPDF2 (lazy import).
356        """
357        PyPDF2 = self._get_module("PyPDF2")
358        with open(file_path, 'rb') as file:
359            pdf_reader = PyPDF2.PdfReader(file) 
360            
361            # Get total number of pages
362            total_pages = len(pdf_reader.pages)
363            
364            # Determine page range
365            if page_range is None:
366                start_page = 1
367                end_page = total_pages
368            else:
369                start_page, end_page = page_range
370                # Validate page range
371                if start_page < 1 or end_page > total_pages or start_page > end_page:
372                    raise ValueError(f"Invalid page range: {page_range}. Pages must be between 1 and {total_pages}")
373            
374            # Extract text from specified pages
375            text_parts = []
376            for page_num in range(start_page - 1, end_page):
377                page = pdf_reader.pages[page_num]
378                page_text = page.extract_text()
379                if page_text and page_text.strip():
380                    text_parts.append(page_text)
381            
382            text = "\n\n".join(text_parts)
383            
384            # Extract PDF properties
385            properties = {}
386            if getattr(pdf_reader, "metadata", None):
387                metadata = pdf_reader.metadata
388
389                def _get(meta, key):
390                    try:
391                        return meta.get(key) if hasattr(meta, "get") else getattr(meta, key, None)
392                    except Exception:
393                        return None
394
395                mapping = [
396                    ('/Title', 'pdf_title'),
397                    ('/Author', 'pdf_author'),
398                    ('/Subject', 'pdf_subject'),
399                    ('/Creator', 'pdf_creator'),
400                    ('/Producer', 'pdf_producer'),
401                    ('/CreationDate', 'pdf_creation_date'),
402                    ('/ModDate', 'pdf_modification_date'),
403                ]
404                for k_src, k_dst in mapping:
405                    val = _get(metadata, k_src)
406                    if val:
407                        properties[k_dst] = str(val)
408            
409            # Add page information
410            page_info = {
411                'total_pages': total_pages,
412                'extracted_pages_start': start_page,
413                'extracted_pages_end': end_page,
414                'extracted_pages_count': end_page - start_page + 1
415            }
416            
417            return text, properties, page_info
418    
419    def _split_text(self, text: str) -> List[str]:
420        """
421        Split text into chunks using the specified chunking strategy.
422        """
423        if len(text) <= self._chunk_size:
424            return [text]
425        
426        if self._chunk_strategy == "word":
427            return self._split_by_words(text)
428        elif self._chunk_strategy == "sentence":
429            return self._split_by_sentences(text)
430        elif self._chunk_strategy == "paragraph":
431            return self._split_by_paragraphs(text)
432        elif self._chunk_strategy == "fixed":
433            return self._split_fixed(text)
434        elif self._chunk_strategy == "semantic":
435            return self._split_semantic(text)
436        elif self._chunk_strategy == "custom":
437            return self._custom_split_func(text, self._chunk_size, self._chunk_overlap)
438        else:
439            raise ValueError(f"Unsupported chunk strategy: {self._chunk_strategy}")
440    
441    def _split_by_words(self, text: str) -> List[str]:
442        """
443        Split text by word boundaries while respecting word count.
444        """
445        words = text.split()
446        
447        if len(words) <= self._chunk_size:
448            return [text]
449        
450        chunks = []
451        start_word = 0
452        
453        while start_word < len(words):
454            end_word = start_word + self._chunk_size
455            chunk_words = words[start_word:end_word]
456            chunk = ' '.join(chunk_words)
457            
458            if chunk.strip():
459                chunks.append(chunk)
460            
461            new_start_word = end_word - self._chunk_overlap
462            if new_start_word <= start_word:
463                new_start_word = start_word + 1
464            start_word = new_start_word
465            
466            if start_word >= len(words):
467                break
468        
469        return chunks
470    
471    def _split_by_sentences(self, text: str) -> List[str]:
472        """
473        Split text by sentence boundaries while respecting sentence count.
474        """
475        sentence_endings = ['.', '!', '?', '\n\n']
476        
477        sentences = []
478        last_pos = 0
479        
480        for i, char in enumerate(text):
481            if char in sentence_endings:
482                sentence = text[last_pos:i+1].strip()
483                if sentence:
484                    sentences.append(sentence)
485                last_pos = i + 1
486        
487        if last_pos < len(text):
488            last_sentence = text[last_pos:].strip()
489            if last_sentence:
490                sentences.append(last_sentence)
491        
492        if len(sentences) <= self._chunk_size:
493            return [text]
494        
495        chunks = []
496        start_sentence = 0
497        
498        while start_sentence < len(sentences):
499            end_sentence = start_sentence + self._chunk_size
500            chunk_sentences = sentences[start_sentence:end_sentence]
501            chunk = ' '.join(chunk_sentences)
502            
503            if chunk.strip():
504                chunks.append(chunk)
505            
506            new_start_sentence = end_sentence - self._chunk_overlap
507            if new_start_sentence <= start_sentence:
508                new_start_sentence = start_sentence + 1
509            start_sentence = new_start_sentence
510            
511            if start_sentence >= len(sentences):
512                break
513        
514        return chunks
515    
516    def _split_by_paragraphs(self, text: str) -> List[str]:
517        """
518        Split text by paragraph boundaries while respecting paragraph count.
519        """
520        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
521        
522        if len(paragraphs) <= self._chunk_size:
523            return [text]
524        
525        chunks = []
526        start_paragraph = 0
527        
528        while start_paragraph < len(paragraphs):
529            end_paragraph = start_paragraph + self._chunk_size
530            chunk_paragraphs = paragraphs[start_paragraph:end_paragraph]
531            chunk = '\n\n'.join(chunk_paragraphs)
532            
533            if chunk.strip():
534                chunks.append(chunk)
535            
536            new_start_paragraph = end_paragraph - self._chunk_overlap
537            if new_start_paragraph <= start_paragraph:
538                new_start_paragraph = start_paragraph + 1
539            start_paragraph = new_start_paragraph
540            
541            if start_paragraph >= len(paragraphs):
542                break
543        
544        return chunks
545    
546    def _split_fixed(self, text: str) -> List[str]:
547        """
548        Split text into fixed-size chunks without considering boundaries.
549        """
550        chunks = []
551        start = 0
552        
553        while start < len(text):
554            end = start + self._chunk_size
555            chunk = text[start:end].strip()
556            
557            if chunk:
558                chunks.append(chunk)
559            
560            new_start = end - self._chunk_overlap
561            if new_start <= start:
562                new_start = start + 1
563            start = new_start
564            
565            if start >= len(text):
566                break
567        
568        return chunks
569    
570    def _split_semantic(self, text: str) -> List[str]:
571        """
572        Split text by semantic boundaries.
573        """
574        semantic_patterns = [
575            '\n# ', '\n## ', '\n### ', '\n#### ',  # Markdown headers
576            '\n1. ', '\n2. ', '\n3. ', '\n4. ', '\n5. ',  # Numbered lists
577            '\n• ', '\n- ', '\n* ',  # Bullet points
578            '\n\n',  # Paragraph breaks
579            '\n---\n', '\n___\n',  # Horizontal rules
580            '\n\nChapter ', '\n\nSection ', '\n\nPart ',  # Document sections
581        ]
582        
583        chunks = []
584        current_chunk = ""
585        
586        parts = [text]
587        for pattern in semantic_patterns:
588            new_parts = []
589            for part in parts:
590                if pattern in part:
591                    split_parts = part.split(pattern)
592                    for i, split_part in enumerate(split_parts):
593                        if i > 0:
594                            split_part = pattern + split_part
595                        if split_part.strip():
596                            new_parts.append(split_part)
597                else:
598                    new_parts.append(part)
599            parts = new_parts
600        
601        for part in parts:
602            if len(current_chunk) + len(part) > self._chunk_size and current_chunk:
603                chunks.append(current_chunk.strip())
604                overlap_start = max(0, len(current_chunk) - self._chunk_overlap)
605                current_chunk = current_chunk[overlap_start:] + part
606            else:
607                current_chunk += part
608        
609        if current_chunk.strip():
610            chunks.append(current_chunk.strip())
611        
612        return chunks
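
The splitting helpers above are selected through the chunk_strategy value passed to the builder. As a minimal sketch of how the same text chunks differently under two strategies (the text and the deliberately tiny sizes are illustrative only, and the string entry point from_str is shown later in this class):

```python
from monoai.rag import DocumentsBuilder

text = "First paragraph about embeddings.\n\nSecond paragraph about retrieval.\n\nThird paragraph about ranking."

# "word" strategy: chunk_size and chunk_overlap are counted in words
word_builder = DocumentsBuilder(chunk_strategy="word", chunk_size=4, chunk_overlap=1)

# "paragraph" strategy: chunk_size is the number of paragraphs per chunk
para_builder = DocumentsBuilder(chunk_strategy="paragraph", chunk_size=1)

word_docs, _, _ = word_builder.from_str(text)
para_docs, _, _ = para_builder.from_str(text)
print(len(word_docs), len(para_docs))  # word chunks vs. paragraph chunks
```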

A utility class for building document collections from various sources.

This class provides methods to extract text content from files and web pages, split the content into manageable chunks with configurable size and overlap, and prepare the data for storage in vector databases.

The DocumentsBuilder is designed to work seamlessly with the RAG system, producing output that can be directly used with vector database operations.
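
For example, since each builder method returns parallel lists of documents, metadatas, and ids, the output maps directly onto a ChromaDB collection. The sketch below uses the chromadb client directly, because the ChromaVectorDB wrapper's own methods are not reproduced in this section; the file name and collection name are illustrative:

```python
import chromadb
from monoai.rag import DocumentsBuilder

builder = DocumentsBuilder(chunk_strategy="word", chunk_size=300, chunk_overlap=30)
documents, metadatas, ids = builder.from_file("notes.txt")

client = chromadb.Client()
collection = client.get_or_create_collection("my_documents")
collection.add(documents=documents, metadatas=metadatas, ids=ids)
```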

Features:

  • File-based document extraction with UTF-8 encoding support
  • Text string processing for in-memory content
  • Web scraping with multiple engine options (requests, tavily, selenium)
  • Word document extraction (.doc and .docx formats)
  • PDF document extraction with metadata
  • Multiple chunking strategies (word, sentence, paragraph, fixed, semantic)
  • Configurable chunk size and overlap parameters
  • Rich metadata generation for each document chunk
  • Unique ID generation for database storage

Attributes:

  • _chunk_strategy (str): The chunking strategy to use
  • _chunk_size (int): Maximum size of each chunk, measured in the unit the strategy works in (words, sentences, paragraphs, or characters)
  • _chunk_overlap (int): Overlap between consecutive chunks, in the same unit as _chunk_size

DocumentsBuilder( chunk_strategy: str = 'word', chunk_size: int = 1000, chunk_overlap: int = 0, custom_split_func: Optional[callable] = None)
42    def __init__(
43        self,
44        chunk_strategy: str = "word",
45        chunk_size: int = 1000,
46        chunk_overlap: int = 0,
47        custom_split_func: Optional[callable] = None
48    ):
49        """
50        Initialize the DocumentsBuilder with chunking parameters.
51        """
52        # If custom_split_func is provided, automatically set strategy to "custom"
53        if custom_split_func is not None:
54            chunk_strategy = "custom"
55        
56        self._chunk_strategy = chunk_strategy
57        self._chunk_size = chunk_size
58        self._chunk_overlap = chunk_overlap
59        self._custom_split_func = custom_split_func
60
61        self._module_cache: Dict[str, object] = {}
62        
63        if chunk_overlap >= chunk_size:
64            raise ValueError(
65                f"chunk_overlap ({chunk_overlap}) must be less than chunk_size ({chunk_size}) "
66                "to prevent infinite loops. Recommended: chunk_overlap should be 10-20% of chunk_size."
67            )
68        
69        if chunk_size <= 0:
70            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
71        
72        if chunk_overlap < 0:
73            raise ValueError(f"chunk_overlap must be non-negative, got {chunk_overlap}")
74        
75        if chunk_strategy == "custom" and custom_split_func is None:
76            raise ValueError("custom_split_func must be provided when chunk_strategy='custom'")
77        
78        if custom_split_func is not None and not callable(custom_split_func):
79            raise ValueError("custom_split_func must be callable")

Initialize the DocumentsBuilder with chunking parameters.
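
A minimal construction sketch; the custom splitter below is a hypothetical function, not part of the library, and passing it switches the strategy to "custom" automatically:

```python
from typing import List
from monoai.rag import DocumentsBuilder

# Default settings: word-based chunking with chunk_size=1000 and no overlap
builder = DocumentsBuilder()

def split_on_blank_lines(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
    # Hypothetical splitter: one chunk per blank-line-separated block;
    # chunk_size and chunk_overlap are accepted but ignored for simplicity.
    return [block.strip() for block in text.split("\n\n") if block.strip()]

# custom_split_func is called as func(text, chunk_size, chunk_overlap)
custom_builder = DocumentsBuilder(
    chunk_size=1000,
    chunk_overlap=100,
    custom_split_func=split_on_blank_lines
)
```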

def from_file(self, file_path: str) -> Tuple[List[str], List[Dict], List[str]]:
102    def from_file(self, file_path: str) -> Tuple[List[str], List[Dict], List[str]]:
103        """
104        Read a file and split it into chunks with specified size and overlap.
105        """
106        if not os.path.exists(file_path):
107            raise FileNotFoundError(f"File not found: {file_path}")
108        
109        with open(file_path, 'r', encoding='utf-8') as file:
110            text = file.read()
111        
112        chunks = self._split_text(text)
113        
114        documents = []
115        metadatas = []
116        ids = []
117        
118        for i, chunk in enumerate(chunks):
119            chunk_id = str(uuid.uuid4())
120            metadata = {
121                'file_path': file_path,
122                'file_name': os.path.basename(file_path),
123                'chunk_index': i,
124                'total_chunks': len(chunks),
125                'chunk_size': len(chunk)
126            }
127            documents.append(chunk)
128            metadatas.append(metadata)
129            ids.append(chunk_id)
130        
131        return documents, metadatas, ids

Read a file and split it into chunks with specified size and overlap.
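
A short usage sketch, assuming a local notes.txt file exists:

```python
builder = DocumentsBuilder(chunk_strategy="word", chunk_size=200, chunk_overlap=20)
documents, metadatas, ids = builder.from_file("notes.txt")

for chunk, meta in zip(documents, metadatas):
    print(meta["file_name"], f'{meta["chunk_index"] + 1}/{meta["total_chunks"]}', len(chunk))
```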

def from_str( self, text: str, source_name: str = 'text_string') -> Tuple[List[str], List[Dict], List[str]]:
133    def from_str(self, text: str, source_name: str = "text_string") -> Tuple[List[str], List[Dict], List[str]]:
134        """
135        Process a text string and split it into chunks with specified size and overlap.
136        """
137        if not text or not text.strip():
138            return [], [], []
139        
140        chunks = self._split_text(text)
141        
142        documents = []
143        metadatas = []
144        ids = []
145        
146        for i, chunk in enumerate(chunks):
147            chunk_id = str(uuid.uuid4())
148            metadata = {
149                'source_type': 'text_string',
150                'source_name': source_name,
151                'chunk_index': i,
152                'total_chunks': len(chunks),
153                'chunk_size': len(chunk),
154                'chunk_strategy': self._chunk_strategy
155            }
156            documents.append(chunk)
157            metadatas.append(metadata)
158            ids.append(chunk_id)
159        
160        return documents, metadatas, ids

Process a text string and split it into chunks with specified size and overlap.
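
A short sketch of processing in-memory text:

```python
builder = DocumentsBuilder(chunk_strategy="sentence", chunk_size=3, chunk_overlap=1)
documents, metadatas, ids = builder.from_str(
    "RAG retrieves documents. It embeds the query. It ranks by similarity. Then it answers.",
    source_name="rag_overview"
)
print(metadatas[0]["source_name"], metadatas[0]["chunk_strategy"], len(documents))
```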

def from_doc( self, file_path: str, extraction_method: str = 'auto') -> Tuple[List[str], List[Dict], List[str]]:
162    def from_doc(self, file_path: str, extraction_method: str = "auto") -> Tuple[List[str], List[Dict], List[str]]:
163        """
164        Extract text from Word documents (.doc and .docx files) and split into chunks.
165        """
166        if not os.path.exists(file_path):
167            raise FileNotFoundError(f"File not found: {file_path}")
168        
169        file_extension = os.path.splitext(file_path)[1].lower()
170        if file_extension not in ['.doc', '.docx']:
171            raise ValueError(f"Unsupported file format: {file_extension}. Only .doc and .docx files are supported.")
172        
173        # Select the extraction method on the fly, with lazy module detection
174        if extraction_method == "auto":
175            if file_extension == '.docx' and self._has_module("docx"):
176                extraction_method = "docx"
177            elif self._has_module("docx2txt"):
178                extraction_method = "docx2txt"
179            else:
180                raise ImportError(
181                    "docx2txt is required for .doc/.docx file extraction. Install with: pip install docx2txt"
182                )
183        
184        if extraction_method == "docx":
185            if file_extension != '.docx':
186                raise ValueError("'docx' extraction method only supports .docx files")
187            text, doc_properties = self._extract_with_docx(file_path)
188        elif extraction_method == "docx2txt":
189            text, doc_properties = self._extract_with_docx2txt(file_path)
190        else:
191            raise ValueError(f"Unsupported extraction method: {extraction_method}")
192        
193        chunks = self._split_text(text)
194        
195        documents = []
196        metadatas = []
197        ids = []
198        
199        for i, chunk in enumerate(chunks):
200            chunk_id = str(uuid.uuid4())
201            metadata = {
202                'file_path': file_path,
203                'file_name': os.path.basename(file_path),
204                'document_format': file_extension[1:],  # Remove the dot
205                'extraction_method': extraction_method,
206                'chunk_index': i,
207                'total_chunks': len(chunks),
208                'chunk_size': len(chunk)
209            }
210            if doc_properties:
211                metadata.update(doc_properties)
212            
213            documents.append(chunk)
214            metadatas.append(metadata)
215            ids.append(chunk_id)
216        
217        return documents, metadatas, ids

Extract text from Word documents (.doc and .docx files) and split into chunks.
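
A hedged sketch, assuming report.docx exists and python-docx or docx2txt is installed:

```python
builder = DocumentsBuilder(chunk_strategy="paragraph", chunk_size=3)

# Let the builder pick an available backend...
documents, metadatas, ids = builder.from_doc("report.docx")

# ...or force a specific one
documents, metadatas, ids = builder.from_doc("report.docx", extraction_method="docx2txt")
print(metadatas[0]["document_format"], metadatas[0]["extraction_method"])
```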

def from_pdf( self, file_path: str, page_range: Optional[Tuple[int, int]] = None) -> Tuple[List[str], List[Dict], List[str]]:
219    def from_pdf(self, file_path: str, page_range: Optional[Tuple[int, int]] = None) -> Tuple[List[str], List[Dict], List[str]]:
220        """
221        Extract text from PDF documents and split into chunks.
222        """
223        if not os.path.exists(file_path):
224            raise FileNotFoundError(f"File not found: {file_path}")
225        
226        file_extension = os.path.splitext(file_path)[1].lower()
227        if file_extension != '.pdf':
228            raise ValueError(f"Unsupported file format: {file_extension}. Only .pdf files are supported.")
229        
230        # PyPDF2 is imported here lazily, only when it is actually required
231        if not self._has_module("PyPDF2"):
232            raise ImportError("PyPDF2 is required for PDF file extraction. Install with: pip install PyPDF2")
233        
234        text, pdf_properties, page_info = self._extract_from_pdf(file_path, page_range)
235        
236        chunks = self._split_text(text)
237        
238        documents = []
239        metadatas = []
240        ids = []
241        
242        for i, chunk in enumerate(chunks):
243            chunk_id = str(uuid.uuid4())
244            metadata = {
245                'file_path': file_path,
246                'file_name': os.path.basename(file_path),
247                'document_format': 'pdf',
248                'chunk_index': i,
249                'total_chunks': len(chunks),
250                'chunk_size': len(chunk)
251            }
252            if pdf_properties:
253                metadata.update(pdf_properties)
254            if page_info:
255                metadata.update(page_info)
256            
257            documents.append(chunk)
258            metadatas.append(metadata)
259            ids.append(chunk_id)
260        
261        return documents, metadatas, ids

Extract text from PDF documents and split into chunks.
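
A hedged sketch, assuming manual.pdf exists and PyPDF2 is installed; page_range is 1-based and inclusive:

```python
builder = DocumentsBuilder(chunk_strategy="fixed", chunk_size=1500, chunk_overlap=150)

# Extract only pages 2 through 5
documents, metadatas, ids = builder.from_pdf("manual.pdf", page_range=(2, 5))
print(metadatas[0]["total_pages"], metadatas[0]["extracted_pages_count"])
```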

def from_url( self, url: str, engine: str = 'requests', deep: bool = False) -> Tuple[List[str], List[Dict], List[str]]:
263    def from_url(self, url: str, engine: str = "requests", deep: bool = False) -> Tuple[List[str], List[Dict], List[str]]:
264        """
265        Scrape content from a URL and split it into chunks with specified size and overlap.
266        
267        Notes:
268        - Scraping may take time depending on the engine and website complexity
269        - Some websites may block automated scraping
270        - Selenium requires Chrome/Chromium to be installed
271        - Tavily requires an API key to be configured
272        """
273        from monoai.tools.webscraping import scrape_web
274        result = scrape_web(url, engine=engine, deep=deep)
275                
276        if not result or not result.get("text"):
277            raise ValueError(f"Failed to extract text content from URL: {url}")
278        
279        text = result["text"]
280        chunks = self._split_text(text)
281        
282        documents = []
283        metadatas = []
284        ids = []
285        
286        for i, chunk in enumerate(chunks):
287            chunk_id = str(uuid.uuid4())
288            metadata = {
289                'url': url,
290                'source_type': 'web_page',
291                'scraping_engine': engine,
292                'deep_extraction': deep,
293                'chunk_index': i,
294                'total_chunks': len(chunks),
295                'chunk_size': len(chunk)
296            }
297            documents.append(chunk)
298            metadatas.append(metadata)
299            ids.append(chunk_id)
300        
301        return documents, metadatas, ids

Scrape content from a URL and split it into chunks with specified size and overlap.

Notes:

  • Scraping may take time depending on the engine and website complexity
  • Some websites may block automated scraping
  • Selenium requires Chrome/Chromium to be installed
  • Tavily requires an API key to be configured
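
A hedged sketch of pulling web content into the same pipeline; the URL is illustrative and the default requests engine is used:

```python
builder = DocumentsBuilder(chunk_strategy="semantic", chunk_size=1200, chunk_overlap=100)
documents, metadatas, ids = builder.from_url("https://example.com/article", engine="requests")
print(metadatas[0]["url"], metadatas[0]["scraping_engine"], metadatas[0]["total_chunks"])
```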