Skip to main content
Score threshold search returns only results that meet a minimum similarity score. This ensures that all returned documents have a meaningful semantic relationship to your query. Use score thresholds when result quality matters more than result count. Without a threshold, search always returns up to limit results regardless of how relevant they are. A threshold discards low-confidence matches early, reducing noise in downstream processing. Before running this example, make sure you have a VectorAI DB instance running at localhost:50051 and the relevant SDK installed. For setup instructions, see Docker installation.
from __future__ import annotations

import random
import zlib

from actian_vectorai import (
    Distance,
    FieldType,
    PointStruct,
    VectorAIClient,
    VectorParams,
)

SERVER = "localhost:50051"  # gRPC endpoint of the local VectorAI DB instance
COLLECTION = "semantic_demo"  # demo collection; dropped and recreated each run
DIM = 64  # embedding dimensionality (must match the collection's VectorParams)
fmt = "\n=== {:50} ==="  # section-header template for console output

# Simulated document corpus: each entry has an integer id, the text that gets
# embedded, and "topic"/"year" metadata that is stored as the point payload
# (and indexed below so it can be used in filtered queries).
DOCUMENTS = [
    {
        "id": 1,
        "text": "Python is a popular programming language",
        "topic": "programming",
        "year": 2024,
    },
    {
        "id": 2,
        "text": "Machine learning transforms data into insights",
        "topic": "ml",
        "year": 2024,
    },
    {
        "id": 3,
        "text": "Vector databases enable semantic search",
        "topic": "databases",
        "year": 2024,
    },
    {"id": 4, "text": "Neural networks learn hierarchical features", "topic": "ml", "year": 2023},
    {
        "id": 5,
        "text": "SQL is the language of relational databases",
        "topic": "databases",
        "year": 2020,
    },
    {"id": 6, "text": "Deep learning requires large datasets", "topic": "ml", "year": 2023},
    {"id": 7, "text": "Graph databases model relationships", "topic": "databases", "year": 2022},
    {"id": 8, "text": "Transformers revolutionized NLP", "topic": "ml", "year": 2023},
    {
        "id": 9,
        "text": "Rust is a memory-safe systems language",
        "topic": "programming",
        "year": 2024,
    },
    {"id": 10, "text": "Embeddings represent meaning as vectors", "topic": "ml", "year": 2024},
]


def fake_embed(text: str, dim: int = DIM) -> list[float]:
    """Return a deterministic pseudo-embedding of *dim* floats for *text*.

    The previous implementation seeded the *global* RNG with ``hash(text)``.
    That had two problems: ``hash()`` on str is randomized per interpreter
    run (PYTHONHASHSEED), so the "deterministic" embedding actually changed
    between runs; and reseeding ``random`` clobbered shared RNG state for
    the rest of the process.  Using ``zlib.crc32`` as the seed is stable
    across runs and platforms, and a private ``random.Random`` instance
    leaves the module-level RNG untouched.
    """
    # crc32 of the UTF-8 bytes: a cheap, process-independent 32-bit seed.
    rng = random.Random(zlib.crc32(text.encode("utf-8")))
    return [rng.gauss(0, 1) for _ in range(dim)]


def main() -> None:
    """End-to-end demo: index a small corpus, then search with a score threshold."""
    with VectorAIClient(SERVER) as client:
        # Start from a clean slate — drop any collection left by a prior run.
        if client.collections.exists(COLLECTION):
            client.collections.delete(COLLECTION)
        client.collections.create(
            COLLECTION,
            vectors_config=VectorParams(size=DIM, distance=Distance.Cosine),
        )

        # Payload field indexes so "topic" and "year" can back filtered queries.
        client.points.create_field_index(COLLECTION, "topic", FieldType.FieldTypeKeyword)
        client.points.create_field_index(COLLECTION, "year", FieldType.FieldTypeInteger)

        def to_point(doc: dict) -> PointStruct:
            # One point per document: pseudo-embedding plus searchable metadata.
            return PointStruct(
                id=doc["id"],
                vector=fake_embed(doc["text"]),
                payload={"text": doc["text"], "topic": doc["topic"], "year": doc["year"]},
            )

        client.points.upsert(COLLECTION, [to_point(doc) for doc in DOCUMENTS])
        print(f"✓ Indexed {len(DOCUMENTS)} documents")

        # ── Score threshold ─────────────────────────────────
        # Only hits scoring >= 0.5 come back; the result count may be < limit.
        print(fmt.format("Semantic with score_threshold=0.5"))
        query_embedding = fake_embed("how do vector databases work?")
        hits = client.points.search(
            COLLECTION,
            vector=query_embedding,
            limit=10,
            score_threshold=0.5,
            with_payload=True,
        )
        print(f"  {len(hits)} results above threshold")
        for hit in hits:
            print(f"  score={hit.score:.4f} | {hit.payload['text']}")

        # Cleanup
        client.collections.delete(COLLECTION)
        print("\n✓ Cleaned up")


if __name__ == "__main__":
    main()
The score threshold parameter (score_threshold in Python, scoreThreshold in JavaScript) filters out any result with a cosine similarity score below 0.5. The number of returned results may be fewer than limit if not enough documents meet the threshold. Each result includes these fields:
  • id: The unique identifier of the matching document
  • score: Similarity score guaranteed to be at or above the threshold
  • payload: Metadata object containing the document text and attributes

Choosing a threshold

The optimal threshold depends on your embedding model and data:
  • Higher thresholds (for example, 0.7 or above) return fewer, more precise results. Use this when false positives are costly.
  • Lower thresholds (for example, 0.3 to 0.5) return more results with broader recall. Use this when coverage matters more than precision.
  • No threshold returns up to limit results regardless of quality. Use this when you want as many results as the collection can provide, even low-confidence ones.
Experiment with different thresholds on your data to find the right balance. Threshold behavior varies across embedding models because different models produce different score distributions.