Skip to main content
Score threshold search returns only results that meet a minimum similarity score. This ensures that all returned documents have a meaningful semantic relationship to your query. Use score thresholds when result quality matters more than result count. Without a threshold, search always returns up to limit results regardless of how relevant they are. A threshold discards low-confidence matches early, reducing noise in downstream processing. Before running this example, make sure you have a VectorAI DB instance running at localhost:50051 and the relevant SDK installed. For setup instructions, see Docker installation.
from __future__ import annotations

import random
import zlib

from actian_vectorai import (
    Distance,
    FieldType,
    PointStruct,
    VectorAIClient,
    VectorParams,
)

SERVER = "localhost:50051"  # gRPC endpoint of the local VectorAI DB instance
COLLECTION = "semantic_demo"  # demo collection; dropped and recreated each run
DIM = 64  # embedding dimensionality (must match the collection's VectorParams)
fmt = "\n=== {:50} ==="  # section-header template for console output

# Simulated document corpus: each entry has an integer id, the text that gets
# embedded, and "topic"/"year" metadata that is stored as the point payload
# (and indexed below so it can be used in filtered queries).
DOCUMENTS = [
    {
        "id": 1,
        "text": "Python is a popular programming language",
        "topic": "programming",
        "year": 2024,
    },
    {
        "id": 2,
        "text": "Machine learning transforms data into insights",
        "topic": "ml",
        "year": 2024,
    },
    {
        "id": 3,
        "text": "Vector databases enable semantic search",
        "topic": "databases",
        "year": 2024,
    },
    {"id": 4, "text": "Neural networks learn hierarchical features", "topic": "ml", "year": 2023},
    {
        "id": 5,
        "text": "SQL is the language of relational databases",
        "topic": "databases",
        "year": 2020,
    },
    {"id": 6, "text": "Deep learning requires large datasets", "topic": "ml", "year": 2023},
    {"id": 7, "text": "Graph databases model relationships", "topic": "databases", "year": 2022},
    {"id": 8, "text": "Transformers revolutionized NLP", "topic": "ml", "year": 2023},
    {
        "id": 9,
        "text": "Rust is a memory-safe systems language",
        "topic": "programming",
        "year": 2024,
    },
    {"id": 10, "text": "Embeddings represent meaning as vectors", "topic": "ml", "year": 2024},
]


def fake_embed(text: str, dim: int = DIM) -> list[float]:
    """Return a deterministic pseudo-embedding of *dim* floats for *text*.

    The previous implementation seeded the *global* RNG with ``hash(text)``.
    That had two problems: ``hash()`` on str is randomized per interpreter
    run (PYTHONHASHSEED), so the "deterministic" embedding actually changed
    between runs; and reseeding ``random`` clobbered shared RNG state for
    the rest of the process.  Using ``zlib.crc32`` as the seed is stable
    across runs and platforms, and a private ``random.Random`` instance
    leaves the module-level RNG untouched.
    """
    # crc32 of the UTF-8 bytes: a cheap, process-independent 32-bit seed.
    rng = random.Random(zlib.crc32(text.encode("utf-8")))
    return [rng.gauss(0, 1) for _ in range(dim)]


def main() -> None:
    """End-to-end demo: index a small corpus, then search with a score threshold."""
    with VectorAIClient(SERVER) as client:
        # Start from a clean slate — drop any collection left by a prior run.
        if client.collections.exists(COLLECTION):
            client.collections.delete(COLLECTION)
        client.collections.create(
            COLLECTION,
            vectors_config=VectorParams(size=DIM, distance=Distance.Cosine),
        )

        # Payload field indexes so "topic" and "year" can back filtered queries.
        client.points.create_field_index(COLLECTION, "topic", FieldType.FieldTypeKeyword)
        client.points.create_field_index(COLLECTION, "year", FieldType.FieldTypeInteger)

        def to_point(doc: dict) -> PointStruct:
            # One point per document: pseudo-embedding plus searchable metadata.
            return PointStruct(
                id=doc["id"],
                vector=fake_embed(doc["text"]),
                payload={"text": doc["text"], "topic": doc["topic"], "year": doc["year"]},
            )

        client.points.upsert(COLLECTION, [to_point(doc) for doc in DOCUMENTS])
        print(f"✓ Indexed {len(DOCUMENTS)} documents")

        # ── Score threshold ─────────────────────────────────
        # Only hits scoring >= 0.5 come back; the result count may be < limit.
        print(fmt.format("Semantic with score_threshold=0.5"))
        query_embedding = fake_embed("how do vector databases work?")
        hits = client.points.search(
            COLLECTION,
            vector=query_embedding,
            limit=10,
            score_threshold=0.5,
            with_payload=True,
        )
        print(f"  {len(hits)} results above threshold")
        for hit in hits:
            print(f"  score={hit.score:.4f} | {hit.payload['text']}")

        # Cleanup
        client.collections.delete(COLLECTION)
        print("\n✓ Cleaned up")


if __name__ == "__main__":
    main()
The score threshold parameter (score_threshold in Python, scoreThreshold in JavaScript) filters out any result with a cosine similarity score below 0.5. The number of returned results may be fewer than limit if not enough documents meet the threshold. Each result includes these fields:
  • id: The unique identifier of the matching document
  • score: Similarity score guaranteed to be at or above the threshold
  • payload: Metadata object containing the document text and attributes

Choosing a threshold

The optimal threshold depends on your embedding model and data:
  • Higher thresholds (for example, 0.7 or above) return fewer, more precise results. Use this when false positives are costly.
  • Lower thresholds (for example, 0.3 to 0.5) return more results with broader recall. Use this when coverage matters more than precision.
  • No threshold returns up to limit results regardless of quality. Use this when you want as many results as the collection can provide, even low-confidence ones.
Experiment with different thresholds on your data to find the right balance. Threshold behavior varies across embedding models because different models produce different score distributions.