Skip to main content
Pure semantic search retrieves the most similar documents to your query using only vector similarity. No metadata filters are applied — all documents in the collection are candidates. Use pure semantic search when you want broad retrieval across your entire document corpus without restricting results by category, date, or other metadata. Before running this example, make sure you have a VectorAI DB instance running at localhost:50051 and the relevant SDK installed. For setup instructions, see Docker installation. This example demonstrates the core semantic search pipeline:
  1. Create a collection with cosine distance and a vector dimension matching your embeddings.
  2. Create field indexes on payload fields you plan to filter on later.
  3. Embed and upsert documents with their text, vector, and metadata payload.
  4. Search with a query vector and retrieve the top-K most similar results with their payload.
Each result includes these fields:
  • id: The unique identifier of the matching document
  • score: Cosine similarity score. Higher values indicate greater semantic similarity.
  • payload: Metadata object containing the document text and attributes
from __future__ import annotations

import hashlib
import random

from actian_vectorai import (
    Distance,
    FieldType,
    PointStruct,
    VectorAIClient,
    VectorParams,
)

SERVER = "localhost:50051"  # gRPC endpoint of the local VectorAI DB instance
COLLECTION = "semantic_demo"  # name of the demo collection (created and deleted by main())
DIM = 64  # embedding dimensionality; must match the vectors produced by fake_embed()
fmt = "\n=== {:50} ==="  # section-header template: pads the title to 50 chars inside === rails

# Simulated document corpus.  Each record holds the raw text plus the
# metadata ("topic", "year") that becomes the point payload at index time.
DOCUMENTS = [
    {"id": 1, "text": "Python is a popular programming language", "topic": "programming", "year": 2024},
    {"id": 2, "text": "Machine learning transforms data into insights", "topic": "ml", "year": 2024},
    {"id": 3, "text": "Vector databases enable semantic search", "topic": "databases", "year": 2024},
    {"id": 4, "text": "Neural networks learn hierarchical features", "topic": "ml", "year": 2023},
    {"id": 5, "text": "SQL is the language of relational databases", "topic": "databases", "year": 2020},
    {"id": 6, "text": "Deep learning requires large datasets", "topic": "ml", "year": 2023},
    {"id": 7, "text": "Graph databases model relationships", "topic": "databases", "year": 2022},
    {"id": 8, "text": "Transformers revolutionized NLP", "topic": "ml", "year": 2023},
    {"id": 9, "text": "Rust is a memory-safe systems language", "topic": "programming", "year": 2024},
    {"id": 10, "text": "Embeddings represent meaning as vectors", "topic": "ml", "year": 2024},
]


def fake_embed(text: str, dim: int = DIM) -> list[float]:
    """Deterministic pseudo-embedding based on text hash."""
    random.seed(hash(text) % (2**32))
    return [random.gauss(0, 1) for _ in range(dim)]


def main() -> None:
    """Run the end-to-end pure semantic search demo against a local server."""
    with VectorAIClient(SERVER) as client:
        # Start from a clean slate: drop any leftover collection first.
        if client.collections.exists(COLLECTION):
            client.collections.delete(COLLECTION)
        client.collections.create(
            COLLECTION,
            vectors_config=VectorParams(size=DIM, distance=Distance.Cosine),
        )

        # Index the payload fields so later examples can filter on them.
        client.points.create_field_index(COLLECTION, "topic", FieldType.FieldTypeKeyword)
        client.points.create_field_index(COLLECTION, "year", FieldType.FieldTypeInteger)

        # Embed each document and upsert it together with its metadata payload.
        batch = []
        for doc in DOCUMENTS:
            payload = {"text": doc["text"], "topic": doc["topic"], "year": doc["year"]}
            batch.append(PointStruct(id=doc["id"], vector=fake_embed(doc["text"]), payload=payload))
        client.points.upsert(COLLECTION, batch)
        print(f"✓ Indexed {len(DOCUMENTS)} documents")

        # ── Pure semantic search ────────────────────────────
        print(fmt.format("Semantic: 'how do vector databases work?'"))
        question = "how do vector databases work?"
        hits = client.points.search(
            COLLECTION,
            vector=fake_embed(question),
            limit=5,
            with_payload=True,
        )
        for hit in hits:
            print(f"  score={hit.score:.4f} | {hit.payload['text']}")

        # Drop the demo collection so reruns start clean.
        client.collections.delete(COLLECTION)
        print("\n✓ Cleaned up")


# Run the demo only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
In production, replace the placeholder embedding function (fake_embed in Python, fakeEmbed in JavaScript) with a real embedding model such as OpenAI, Cohere, or an open-source model like Sentence Transformers. Use the same model for both indexing and querying.