Skip to main content
Pure semantic search retrieves the most similar documents to your query using only vector similarity. No metadata filters are applied — all documents in the collection are candidates. Use pure semantic search when you want broad retrieval across your entire document corpus without restricting results by category, date, or other metadata. Before running this example, make sure you have a VectorAI DB instance running at localhost:50051 and the relevant SDK installed. For setup instructions, see Docker installation. This example demonstrates the core semantic search pipeline:
  1. Create a collection with cosine distance and a vector dimension matching your embeddings.
  2. Create field indexes on payload fields you plan to filter on later.
  3. Embed and upsert documents with their text, vector, and metadata payload.
  4. Search with a query vector and retrieve the top-K most similar results with their payload.
Each result includes these fields:
  • id: The unique identifier of the matching document
  • score: Cosine similarity score. Higher values indicate greater semantic similarity.
  • payload: Metadata object containing the document text and attributes
from __future__ import annotations

import hashlib
import random

from actian_vectorai import (
    Distance,
    FieldType,
    PointStruct,
    VectorAIClient,
    VectorParams,
)

SERVER = "localhost:50051"  # gRPC endpoint of the local VectorAI DB instance
COLLECTION = "semantic_demo"  # name of the demo collection (created and deleted by main())
DIM = 64  # embedding dimensionality; must match the vectors produced by fake_embed()
fmt = "\n=== {:50} ==="  # section-header template: pads the title to 50 chars inside === rails

# Simulated document corpus.  Each record holds the raw text plus the
# metadata ("topic", "year") that becomes the point payload at index time.
DOCUMENTS = [
    {"id": 1, "text": "Python is a popular programming language", "topic": "programming", "year": 2024},
    {"id": 2, "text": "Machine learning transforms data into insights", "topic": "ml", "year": 2024},
    {"id": 3, "text": "Vector databases enable semantic search", "topic": "databases", "year": 2024},
    {"id": 4, "text": "Neural networks learn hierarchical features", "topic": "ml", "year": 2023},
    {"id": 5, "text": "SQL is the language of relational databases", "topic": "databases", "year": 2020},
    {"id": 6, "text": "Deep learning requires large datasets", "topic": "ml", "year": 2023},
    {"id": 7, "text": "Graph databases model relationships", "topic": "databases", "year": 2022},
    {"id": 8, "text": "Transformers revolutionized NLP", "topic": "ml", "year": 2023},
    {"id": 9, "text": "Rust is a memory-safe systems language", "topic": "programming", "year": 2024},
    {"id": 10, "text": "Embeddings represent meaning as vectors", "topic": "ml", "year": 2024},
]


def fake_embed(text: str, dim: int = DIM) -> list[float]:
    """Deterministic pseudo-embedding based on text hash."""
    random.seed(hash(text) % (2**32))
    return [random.gauss(0, 1) for _ in range(dim)]


def main() -> None:
    """Run the end-to-end pure semantic search demo against a local server."""
    with VectorAIClient(SERVER) as client:
        # Start from a clean slate: drop any leftover collection first.
        if client.collections.exists(COLLECTION):
            client.collections.delete(COLLECTION)
        client.collections.create(
            COLLECTION,
            vectors_config=VectorParams(size=DIM, distance=Distance.Cosine),
        )

        # Index the payload fields so later examples can filter on them.
        client.points.create_field_index(COLLECTION, "topic", FieldType.FieldTypeKeyword)
        client.points.create_field_index(COLLECTION, "year", FieldType.FieldTypeInteger)

        # Embed each document and upsert it together with its metadata payload.
        batch = []
        for doc in DOCUMENTS:
            payload = {"text": doc["text"], "topic": doc["topic"], "year": doc["year"]}
            batch.append(PointStruct(id=doc["id"], vector=fake_embed(doc["text"]), payload=payload))
        client.points.upsert(COLLECTION, batch)
        print(f"✓ Indexed {len(DOCUMENTS)} documents")

        # ── Pure semantic search ────────────────────────────
        print(fmt.format("Semantic: 'how do vector databases work?'"))
        question = "how do vector databases work?"
        hits = client.points.search(
            COLLECTION,
            vector=fake_embed(question),
            limit=5,
            with_payload=True,
        )
        for hit in hits:
            print(f"  score={hit.score:.4f} | {hit.payload['text']}")

        # Drop the demo collection so reruns start clean.
        client.collections.delete(COLLECTION)
        print("\n✓ Cleaned up")


# Run the demo only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
In production, replace the placeholder embedding function (fake_embed in Python, fakeEmbed in JavaScript) with a real embedding model such as OpenAI, Cohere, or an open-source model like Sentence Transformers. Use the same model for both indexing and querying.