End-to-end semantic search workflow
This complete workflow demonstrates an end-to-end semantic search pipeline. It covers collection setup, document embedding, field indexing, and all major search strategies: pure semantic search, filtered search, range-filtered search, score threshold search, and multi-constraint search. This mirrors a real-world RAG (Retrieval-Augmented Generation) pipeline where you encode a query, search for similar documents, and filter by metadata. Before running this example, make sure you have a VectorAI DB instance running at localhost:50051 and the relevant SDK installed. For setup instructions, see Docker installation.
from __future__ import annotations

import hashlib
import random

from actian_vectorai import (
    Distance,
    Field,
    FieldType,
    FilterBuilder,
    PointStruct,
    VectorAIClient,
    VectorParams,
)

SERVER = "localhost:50051"  # gRPC endpoint of the local VectorAI DB instance
COLLECTION = "semantic_demo"  # demo collection; dropped and recreated on each run
DIM = 64  # embedding dimensionality shared by fake_embed and VectorParams
fmt = "\n=== {:50} ==="  # section-header template; pads the title to 50 chars

# Simulated document corpus.  Each entry carries the text to embed plus two
# payload fields ("topic", "year") that the filtered-search examples index
# and filter on below.
DOCUMENTS = [
    {
        "id": 1,
        "text": "Python is a popular programming language",
        "topic": "programming",
        "year": 2024,
    },
    {
        "id": 2,
        "text": "Machine learning transforms data into insights",
        "topic": "ml",
        "year": 2024,
    },
    {
        "id": 3,
        "text": "Vector databases enable semantic search",
        "topic": "databases",
        "year": 2024,
    },
    {"id": 4, "text": "Neural networks learn hierarchical features", "topic": "ml", "year": 2023},
    {
        "id": 5,
        "text": "SQL is the language of relational databases",
        "topic": "databases",
        "year": 2020,
    },
    {"id": 6, "text": "Deep learning requires large datasets", "topic": "ml", "year": 2023},
    {"id": 7, "text": "Graph databases model relationships", "topic": "databases", "year": 2022},
    {"id": 8, "text": "Transformers revolutionized NLP", "topic": "ml", "year": 2023},
    {
        "id": 9,
        "text": "Rust is a memory-safe systems language",
        "topic": "programming",
        "year": 2024,
    },
    {"id": 10, "text": "Embeddings represent meaning as vectors", "topic": "ml", "year": 2024},
]


def fake_embed(text: str, dim: int = DIM) -> list[float]:
    """Return a deterministic pseudo-embedding for *text*.

    The docstring promise of determinism requires a stable seed, but the
    original ``hash(text)`` is salted per process (PYTHONHASHSEED), so the
    same text would embed differently on every run.  A SHA-256 digest of
    the text gives a seed that is stable across processes and Python
    versions.  A private ``random.Random`` instance is used so the global
    ``random`` module state is left untouched for other callers.

    Args:
        text: Document or query text to embed.
        dim: Length of the returned vector (defaults to the module DIM).

    Returns:
        A list of ``dim`` floats drawn from a seeded Gaussian(0, 1).
    """
    seed = int.from_bytes(hashlib.sha256(text.encode("utf-8")).digest()[:8], "big")
    rng = random.Random(seed)
    return [rng.gauss(0, 1) for _ in range(dim)]


def main() -> None:
    """Run the end-to-end semantic search demo against a local VectorAI DB.

    Creates a fresh collection, indexes the simulated corpus, then runs five
    search strategies: pure semantic, keyword-filtered, range-filtered,
    score-threshold, and multi-constraint.  The demo collection is deleted
    in a ``finally`` block so a failed step does not leave it behind.
    """
    with VectorAIClient(SERVER) as client:
        # Start from a clean slate so repeated runs don't collide.
        if client.collections.exists(COLLECTION):
            client.collections.delete(COLLECTION)
        client.collections.create(
            COLLECTION,
            vectors_config=VectorParams(size=DIM, distance=Distance.Cosine),
        )

        try:
            # Create field indexes for filtered search.  Indexes must exist
            # before the payload fields can be used in filters efficiently.
            client.points.create_field_index(COLLECTION, "topic", FieldType.FieldTypeKeyword)
            client.points.create_field_index(COLLECTION, "year", FieldType.FieldTypeInteger)

            # Embed each document and upsert it with its metadata payload.
            points = [
                PointStruct(
                    id=doc["id"],
                    vector=fake_embed(doc["text"]),
                    payload={"text": doc["text"], "topic": doc["topic"], "year": doc["year"]},
                )
                for doc in DOCUMENTS
            ]
            client.points.upsert(COLLECTION, points)
            print(f"✓ Indexed {len(DOCUMENTS)} documents")

            # ── Pure semantic search ────────────────────────────
            # No filter: every document is a candidate, ranked by similarity.
            print(fmt.format("Semantic: 'how do vector databases work?'"))
            query_vec = fake_embed("how do vector databases work?")
            results = client.points.search(
                COLLECTION,
                vector=query_vec,
                limit=5,
                with_payload=True,
            )
            for r in results:
                print(f"  score={r.score:.4f} | {r.payload['text']}")

            # ── Filtered semantic search ────────────────────────
            # Keyword filter: only topic == "ml" documents are ranked.
            print(fmt.format("Semantic + filter: topic='ml'"))
            f = FilterBuilder().must(Field("topic").eq("ml")).build()
            results = client.points.search(
                COLLECTION,
                vector=query_vec,
                filter=f,
                limit=5,
                with_payload=True,
            )
            for r in results:
                print(f"  score={r.score:.4f} | [{r.payload['topic']}] {r.payload['text']}")

            # ── Range-filtered semantic search ──────────────────
            # Integer range filter: only documents with year >= 2023.
            print(fmt.format("Semantic + filter: year >= 2023"))
            f = FilterBuilder().must(Field("year").gte(2023)).build()
            results = client.points.search(
                COLLECTION,
                vector=query_vec,
                filter=f,
                limit=5,
                with_payload=True,
            )
            for r in results:
                print(f"  score={r.score:.4f} | [{r.payload['year']}] {r.payload['text']}")

            # ── Score threshold ─────────────────────────────────
            # Only results scoring >= 0.5 are returned; the count may be
            # smaller than `limit`.
            print(fmt.format("Semantic with score_threshold=0.5"))
            results = client.points.search(
                COLLECTION,
                vector=query_vec,
                limit=10,
                score_threshold=0.5,
                with_payload=True,
            )
            print(f"  {len(results)} results above threshold")
            for r in results:
                print(f"  score={r.score:.4f} | {r.payload['text']}")

            # ── Combined: multi-constraint ──────────────────────
            # Both must() conditions are ANDed: topic == "ml" AND year >= 2024.
            print(fmt.format("Multi-constraint: ml + year>=2024"))
            f = FilterBuilder().must(Field("topic").eq("ml")).must(Field("year").gte(2024)).build()
            results = client.points.search(
                COLLECTION,
                vector=query_vec,
                filter=f,
                limit=5,
                with_payload=True,
            )
            for r in results:
                print(f"  score={r.score:.4f} | {r.payload['text']}")
        finally:
            # Remove the demo collection even if a step above raised.
            client.collections.delete(COLLECTION)
            print("\n✓ Cleaned up")


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
This workflow covers five search strategies:
  1. Pure semantic search — Retrieves the top five most similar documents with no filters applied. All documents are candidates.
  2. Keyword-filtered search — Restricts results to documents with topic="ml" while ranking by vector similarity.
  3. Range-filtered search — Restricts results to documents with year >= 2023 while ranking by vector similarity.
  4. Score threshold search — Returns only results with a cosine similarity score of 0.5 or higher. The result count may be less than limit.
  5. Multi-constraint search — Combines a keyword filter (topic="ml") with a range filter (year >= 2024). Both conditions must be true.

Key patterns

Keep these practices in mind when building your own semantic search pipeline.
  • Create field indexes before searching — Call create_field_index (Python) or createFieldIndex (JavaScript) for each payload field used in filters. This enables efficient filter evaluation during search.
  • Use the same embedding model for indexing and querying — Vector similarity is only meaningful when both sides use the same model.
  • Combine strategies as needed — Score thresholds and metadata filters can be used together for maximum precision.
In production, replace the placeholder embedding function (fake_embed in Python, fakeEmbed in JavaScript) with a real embedding model. For a ready-to-use example, see OpenAI embedding integration.