Filtered semantic search

Filtered semantic search combines vector similarity with metadata conditions. Results must be both semantically similar to your query and match your filter criteria. Use filtered search to narrow results by category, topic, date range, or other metadata attributes. VectorAI DB evaluates filters during search (pre-filtering), which is more efficient than filtering results after retrieval. Before running these examples, make sure you have a VectorAI DB instance running at localhost:6574 and the relevant SDK installed. For setup instructions, see Docker installation.

Keyword filter

Filter results by a specific keyword field value, such as topic or category:

from __future__ import annotations

import random

from actian_vectorai import (
    Distance,
    Field,
    FieldType,
    FilterBuilder,
    PointStruct,
    VectorAIClient,
    VectorParams,
)

# Connection target, collection name, and embedding width used by the demo.
SERVER = "localhost:6574"
COLLECTION = "semantic_demo"
DIM = 64
# Banner template for section headings: the title is padded to 50 columns.
fmt = "\n=== {:50} ==="

# Simulated document corpus, written as (id, text, topic, year) rows and
# expanded into one dict per document.
DOCUMENTS = [
    {"id": doc_id, "text": text, "topic": topic, "year": year}
    for doc_id, text, topic, year in (
        (1, "Python is a popular programming language", "programming", 2024),
        (2, "Machine learning transforms data into insights", "ml", 2024),
        (3, "Vector databases enable semantic search", "databases", 2024),
        (4, "Neural networks learn hierarchical features", "ml", 2023),
        (5, "SQL is the language of relational databases", "databases", 2020),
        (6, "Deep learning requires large datasets", "ml", 2023),
        (7, "Graph databases model relationships", "databases", 2022),
        (8, "Transformers revolutionized NLP", "ml", 2023),
        (9, "Rust is a memory-safe systems language", "programming", 2024),
        (10, "Embeddings represent meaning as vectors", "ml", 2024),
    )
]


def fake_embed(text: str, dim: int = DIM) -> list[float]:
    """Return a deterministic pseudo-embedding for *text*.

    The vector is drawn from a standard normal distribution using a private
    random generator seeded from a SHA-256 digest of the text, so the same
    text yields the same vector in every process.

    Args:
        text: The text to embed.
        dim: Number of components in the returned vector.

    Returns:
        A list of ``dim`` floats.
    """
    # Local import keeps this function self-contained within the example.
    import hashlib

    # The built-in hash() of a str is salted per interpreter process (unless
    # PYTHONHASHSEED is fixed), so it cannot provide a seed that is stable
    # across runs; a cryptographic digest of the bytes can.
    digest = hashlib.sha256(text.encode("utf-8", "surrogatepass")).digest()
    seed = int.from_bytes(digest[:8], "big")

    # A dedicated generator avoids reseeding the module-level random state
    # that other code in the same process may rely on.
    rng = random.Random(seed)
    return [rng.gauss(0, 1) for _ in range(dim)]


def main() -> None:
    """Index the demo corpus, run a search filtered to topic 'ml', and clean up.

    Recreates COLLECTION on SERVER, creates field indexes on ``topic`` and
    ``year``, upserts one point per entry in DOCUMENTS, prints the filtered
    search hits, and finally deletes the collection.
    """
    with VectorAIClient(SERVER) as client:
        # Drop any collection left over from an earlier run, then recreate it.
        already_present = client.collections.exists(COLLECTION)
        if already_present:
            client.collections.delete(COLLECTION)
        cosine_vectors = VectorParams(size=DIM, distance=Distance.Cosine)
        client.collections.create(COLLECTION, vectors_config=cosine_vectors)

        # Create field indexes for filtered search
        indexed_fields = (
            ("topic", FieldType.FieldTypeKeyword),
            ("year", FieldType.FieldTypeInteger),
        )
        for field_name, field_type in indexed_fields:
            client.points.create_field_index(COLLECTION, field_name, field_type)

        # Embed and insert documents
        payload_keys = ("text", "topic", "year")
        points = [
            PointStruct(
                id=doc["id"],
                vector=fake_embed(doc["text"]),
                payload={key: doc[key] for key in payload_keys},
            )
            for doc in DOCUMENTS
        ]
        client.points.upsert(COLLECTION, points)
        print(f"✓ Indexed {len(DOCUMENTS)} documents")

        # ── Filtered semantic search ────────────────────────
        print(fmt.format("Semantic + filter: topic='ml'"))
        query_vec = fake_embed("how do vector databases work?")
        topic_is_ml = FilterBuilder().must(Field("topic").eq("ml")).build()
        hits = client.points.search(
            COLLECTION,
            vector=query_vec,
            filter=topic_is_ml,
            limit=5,
            with_payload=True,
        )
        for hit in hits:
            print(f"  score={hit.score:.4f} | [{hit.payload['topic']}] {hit.payload['text']}")

        # Cleanup
        client.collections.delete(COLLECTION)
        print("\n✓ Cleaned up")


if __name__ == "__main__":
    main()

import { VectorAIClient, Field } from '@actian/vectorai-client';

// Connection target, collection name, and embedding width used by the demo.
const SERVER = 'localhost:6574';
const COLLECTION = 'semantic_demo';
const DIM = 64;

// Simulated document corpus, written as [id, text, topic, year] rows and
// expanded into one object per document.
const DOCUMENTS = [
  [1, 'Python is a popular programming language', 'programming', 2024],
  [2, 'Machine learning transforms data into insights', 'ml', 2024],
  [3, 'Vector databases enable semantic search', 'databases', 2024],
  [4, 'Neural networks learn hierarchical features', 'ml', 2023],
  [5, 'SQL is the language of relational databases', 'databases', 2020],
  [6, 'Deep learning requires large datasets', 'ml', 2023],
  [7, 'Graph databases model relationships', 'databases', 2022],
  [8, 'Transformers revolutionized NLP', 'ml', 2023],
  [9, 'Rust is a memory-safe systems language', 'programming', 2024],
  [10, 'Embeddings represent meaning as vectors', 'ml', 2024],
].map(([id, text, topic, year]) => ({ id, text, topic, year }));

/**
 * Deterministic pseudo-embedding based on text hash.
 *
 * Folds the UTF-16 code units of `text` into a 32-bit rolling hash, then
 * derives each component from the fractional part of a scaled sine of the
 * hash magnitude.
 */
function fakeEmbed(text, dim = DIM) {
  // `| 0` wraps the running value to a signed 32-bit integer on every step.
  let rolling = 0;
  let pos = 0;
  while (pos < text.length) {
    rolling = (rolling * 31 + text.charCodeAt(pos)) | 0;
    pos += 1;
  }
  const seed = Math.abs(rolling);

  const fractionalPart = (value) => value - Math.floor(value);
  const components = [];
  for (let slot = 0; slot < dim; slot += 1) {
    components.push(fractionalPart(Math.sin(seed * (slot + 1)) * 10000));
  }
  return components;
}

/**
 * Index the demo corpus, run a search filtered to topic 'ml', and clean up.
 * The client is closed in `finally`, whether or not the steps succeed.
 */
async function main() {
  const client = new VectorAIClient(SERVER);
  try {
    // Start fresh: any rejection from this delete is deliberately ignored.
    await client.collections.delete(COLLECTION).catch(() => {});
    const collectionConfig = { dimension: DIM, distanceMetric: 'COSINE' };
    await client.collections.create(COLLECTION, collectionConfig);

    // Create field indexes for filtered search
    const indexedFields = [
      ['topic', 'KEYWORD'],
      ['year', 'INTEGER'],
    ];
    for (const [fieldName, fieldType] of indexedFields) {
      await client.points.createFieldIndex(COLLECTION, fieldName, { fieldType });
    }

    // Embed and insert documents
    const points = DOCUMENTS.map(({ id, text, topic, year }) => ({
      id,
      vector: fakeEmbed(text),
      payload: { text, topic, year },
    }));
    await client.points.upsert(COLLECTION, points, { wait: true });
    console.log(`Indexed ${DOCUMENTS.length} documents`);

    // -- Filtered semantic search --
    console.log("\n=== Semantic + filter: topic='ml' ===");
    const queryVec = fakeEmbed('how do vector databases work?');
    const topicIsMl = new Field('topic').eq('ml');
    const hits = await client.points.search(COLLECTION, queryVec, {
      filter: topicIsMl,
      limit: 5,
      withPayload: true,
    });
    for (const hit of hits) {
      console.log(`  score=${hit.score.toFixed(4)} | [${hit.payload.topic}] ${hit.payload.text}`);
    }

    // Cleanup
    await client.collections.delete(COLLECTION);
    console.log('\nCleaned up');
  } finally {
    client.close();
  }
}

main().catch(console.error);

The filter requires the topic field to equal "ml". In Python, use FilterBuilder with .must() to construct the condition. In JavaScript, use new Field('topic').eq('ml') directly. Only documents matching this condition are considered during vector similarity search.

Range filter

Filter results by a numeric range, such as documents from a specific year onward:

from __future__ import annotations

import random

from actian_vectorai import (
    Distance,
    Field,
    FieldType,
    FilterBuilder,
    PointStruct,
    VectorAIClient,
    VectorParams,
)

# Connection target, collection name, and embedding width used by the demo.
SERVER = "localhost:6574"
COLLECTION = "semantic_demo"
DIM = 64
# Banner template for section headings: the title is padded to 50 columns.
fmt = "\n=== {:50} ==="

# Simulated document corpus, written as (id, text, topic, year) rows and
# expanded into one dict per document.
DOCUMENTS = [
    {"id": doc_id, "text": text, "topic": topic, "year": year}
    for doc_id, text, topic, year in (
        (1, "Python is a popular programming language", "programming", 2024),
        (2, "Machine learning transforms data into insights", "ml", 2024),
        (3, "Vector databases enable semantic search", "databases", 2024),
        (4, "Neural networks learn hierarchical features", "ml", 2023),
        (5, "SQL is the language of relational databases", "databases", 2020),
        (6, "Deep learning requires large datasets", "ml", 2023),
        (7, "Graph databases model relationships", "databases", 2022),
        (8, "Transformers revolutionized NLP", "ml", 2023),
        (9, "Rust is a memory-safe systems language", "programming", 2024),
        (10, "Embeddings represent meaning as vectors", "ml", 2024),
    )
]


def fake_embed(text: str, dim: int = DIM) -> list[float]:
    """Return a deterministic pseudo-embedding for *text*.

    The vector is drawn from a standard normal distribution using a private
    random generator seeded from a SHA-256 digest of the text, so the same
    text yields the same vector in every process.

    Args:
        text: The text to embed.
        dim: Number of components in the returned vector.

    Returns:
        A list of ``dim`` floats.
    """
    # Local import keeps this function self-contained within the example.
    import hashlib

    # The built-in hash() of a str is salted per interpreter process (unless
    # PYTHONHASHSEED is fixed), so it cannot provide a seed that is stable
    # across runs; a cryptographic digest of the bytes can.
    digest = hashlib.sha256(text.encode("utf-8", "surrogatepass")).digest()
    seed = int.from_bytes(digest[:8], "big")

    # A dedicated generator avoids reseeding the module-level random state
    # that other code in the same process may rely on.
    rng = random.Random(seed)
    return [rng.gauss(0, 1) for _ in range(dim)]


def main() -> None:
    """Index the demo corpus, run a search filtered to year >= 2023, and clean up.

    Recreates COLLECTION on SERVER, creates field indexes on ``topic`` and
    ``year``, upserts one point per entry in DOCUMENTS, prints the filtered
    search hits, and finally deletes the collection.
    """
    with VectorAIClient(SERVER) as client:
        # Drop any collection left over from an earlier run, then recreate it.
        already_present = client.collections.exists(COLLECTION)
        if already_present:
            client.collections.delete(COLLECTION)
        cosine_vectors = VectorParams(size=DIM, distance=Distance.Cosine)
        client.collections.create(COLLECTION, vectors_config=cosine_vectors)

        # Create field indexes for filtered search
        indexed_fields = (
            ("topic", FieldType.FieldTypeKeyword),
            ("year", FieldType.FieldTypeInteger),
        )
        for field_name, field_type in indexed_fields:
            client.points.create_field_index(COLLECTION, field_name, field_type)

        # Embed and insert documents
        payload_keys = ("text", "topic", "year")
        points = [
            PointStruct(
                id=doc["id"],
                vector=fake_embed(doc["text"]),
                payload={key: doc[key] for key in payload_keys},
            )
            for doc in DOCUMENTS
        ]
        client.points.upsert(COLLECTION, points)
        print(f"✓ Indexed {len(DOCUMENTS)} documents")

        # ── Range-filtered semantic search ──────────────────
        print(fmt.format("Semantic + filter: year >= 2023"))
        query_vec = fake_embed("how do vector databases work?")
        since_2023 = FilterBuilder().must(Field("year").gte(2023)).build()
        hits = client.points.search(
            COLLECTION,
            vector=query_vec,
            filter=since_2023,
            limit=5,
            with_payload=True,
        )
        for hit in hits:
            print(f"  score={hit.score:.4f} | [{hit.payload['year']}] {hit.payload['text']}")

        # Cleanup
        client.collections.delete(COLLECTION)
        print("\n✓ Cleaned up")


if __name__ == "__main__":
    main()

import { VectorAIClient, Field } from '@actian/vectorai-client';

// Connection target, collection name, and embedding width used by the demo.
const SERVER = 'localhost:6574';
const COLLECTION = 'semantic_demo';
const DIM = 64;

// Simulated document corpus, written as [id, text, topic, year] rows and
// expanded into one object per document.
const DOCUMENTS = [
  [1, 'Python is a popular programming language', 'programming', 2024],
  [2, 'Machine learning transforms data into insights', 'ml', 2024],
  [3, 'Vector databases enable semantic search', 'databases', 2024],
  [4, 'Neural networks learn hierarchical features', 'ml', 2023],
  [5, 'SQL is the language of relational databases', 'databases', 2020],
  [6, 'Deep learning requires large datasets', 'ml', 2023],
  [7, 'Graph databases model relationships', 'databases', 2022],
  [8, 'Transformers revolutionized NLP', 'ml', 2023],
  [9, 'Rust is a memory-safe systems language', 'programming', 2024],
  [10, 'Embeddings represent meaning as vectors', 'ml', 2024],
].map(([id, text, topic, year]) => ({ id, text, topic, year }));

/**
 * Deterministic pseudo-embedding based on text hash.
 *
 * Folds the UTF-16 code units of `text` into a 32-bit rolling hash, then
 * derives each component from the fractional part of a scaled sine of the
 * hash magnitude.
 */
function fakeEmbed(text, dim = DIM) {
  // `| 0` wraps the running value to a signed 32-bit integer on every step.
  let rolling = 0;
  let pos = 0;
  while (pos < text.length) {
    rolling = (rolling * 31 + text.charCodeAt(pos)) | 0;
    pos += 1;
  }
  const seed = Math.abs(rolling);

  const fractionalPart = (value) => value - Math.floor(value);
  const components = [];
  for (let slot = 0; slot < dim; slot += 1) {
    components.push(fractionalPart(Math.sin(seed * (slot + 1)) * 10000));
  }
  return components;
}

/**
 * Index the demo corpus, run a search filtered to year >= 2023, and clean up.
 * The client is closed in `finally`, whether or not the steps succeed.
 */
async function main() {
  const client = new VectorAIClient(SERVER);
  try {
    // Start fresh: any rejection from this delete is deliberately ignored.
    await client.collections.delete(COLLECTION).catch(() => {});
    const collectionConfig = { dimension: DIM, distanceMetric: 'COSINE' };
    await client.collections.create(COLLECTION, collectionConfig);

    // Create field indexes for filtered search
    const indexedFields = [
      ['topic', 'KEYWORD'],
      ['year', 'INTEGER'],
    ];
    for (const [fieldName, fieldType] of indexedFields) {
      await client.points.createFieldIndex(COLLECTION, fieldName, { fieldType });
    }

    // Embed and insert documents
    const points = DOCUMENTS.map(({ id, text, topic, year }) => ({
      id,
      vector: fakeEmbed(text),
      payload: { text, topic, year },
    }));
    await client.points.upsert(COLLECTION, points, { wait: true });
    console.log(`Indexed ${DOCUMENTS.length} documents`);

    // -- Range-filtered semantic search --
    console.log('\n=== Semantic + filter: year >= 2023 ===');
    const queryVec = fakeEmbed('how do vector databases work?');
    const since2023 = new Field('year').gte(2023);
    const hits = await client.points.search(COLLECTION, queryVec, {
      filter: since2023,
      limit: 5,
      withPayload: true,
    });
    for (const hit of hits) {
      console.log(`  score=${hit.score.toFixed(4)} | [${hit.payload.year}] ${hit.payload.text}`);
    }

    // Cleanup
    await client.collections.delete(COLLECTION);
    console.log('\nCleaned up');
  } finally {
    client.close();
  }
}

main().catch(console.error);

The gte operator on the year field restricts results to documents from 2023 onward. VectorAI DB evaluates this condition during search, so only qualifying documents are compared by vector similarity. Each result includes these fields:

id: The unique identifier of the matching document
score: Similarity score for documents that passed the filter
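payload: Metadata stored with the document (text, topic, and year in these examples), returned because with_payload (Python) or withPayload (JavaScript) is set to true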

Create field indexes before running filtered searches (create_field_index in Python, createFieldIndex in JavaScript). Without indexes, VectorAI DB scans all points to evaluate filter conditions, which reduces performance. For the full filter syntax, see Filtering.

Collections

Points

Vectors

Payload

Search

Filtering

Semantic search

Hybrid search

Distance metrics

Indexing

Filtered semantic search

Keyword filter

Range filter

Keyword filter

Range filter

Keyword filter

Range filter