Multiple Tokenizers Per Field

In many cases, a text field needs to be tokenized multiple ways. For instance, using the unicode tokenizer for search, and the literal tokenizer for Top K ordering. To tokenize a field in more than one way, append an alias=<alias_name> argument to the additional tokenizer configurations. The alias name can be any string you like. For instance, the following statement tokenizes description using both the simple and literal tokenizers.

CREATE INDEX search_idx ON mock_items
USING bm25 (
  id,
  (description::pdb.literal),
  (description::pdb.simple('alias=description_simple'))
) WITH (key_field='id');

Under the hood, two distinct fields are created in the index: a field called description, which uses the literal tokenizer, and an aliased field called description_simple, which uses the simple tokenizer. To query against the aliased field, cast it to pdb.alias('alias_name'):

-- Query against `description_simple`
SELECT description, rating, category
FROM mock_items
WHERE description::pdb.alias('description_simple') ||| 'Sleek running shoes';

-- Query against `description`
SELECT description, rating, category
FROM mock_items
WHERE description ||| 'Sleek running shoes';

import { search } from "@paradedb/drizzle-paradedb";

// Query against `description_simple`
await db
  .select({
    description: mockItems.description,
    rating: mockItems.rating,
    category: mockItems.category,
  })
  .from(mockItems)
  .where(
    search.matchAny(
      search.alias(mockItems.description, "description_simple"),
      "Sleek running shoes",
    ),
  );

// Query against `description`
await db
  .select({
    description: mockItems.description,
    rating: mockItems.rating,
    category: mockItems.category,
  })
  .from(mockItems)
  .where(search.matchAny(mockItems.description, "Sleek running shoes"));

from paradedb import MatchAny, ParadeDB

# Query against `description_simple`
MockItem.objects.extra(
    where=["(description::pdb.alias('description_simple')) ||| 'Sleek running shoes'"]
).values('description', 'rating', 'category')

# Query against `description`
MockItem.objects.filter(
    description=ParadeDB(MatchAny('Sleek running shoes'))
).values('description', 'rating', 'category')

from sqlalchemy import select
from sqlalchemy.orm import Session
from paradedb.sqlalchemy import pdb, search

# Query against `description_simple`
stmt_alias = (
    select(MockItem.description, MockItem.rating, MockItem.category)
    .where(search.match_any(pdb.alias(MockItem.description, "description_simple"), "Sleek running shoes"))
)

# Query against `description`
stmt = (
    select(MockItem.description, MockItem.rating, MockItem.category)
    .where(search.match_any(MockItem.description, "Sleek running shoes"))
)

with Session(engine) as session:
    {
        "rows_alias": session.execute(stmt_alias).all(),
        "rows": session.execute(stmt).all(),
    }

# Query against `description_simple`
MockItem.search(:description_simple)
        .match_any("Sleek running shoes")
        .select(:description, :rating, :category)

# Query against `description`
MockItem.search(:description)
        .match_any("Sleek running shoes")
        .select(:description, :rating, :category)

// Query against `description_simple`
await dbContext
    .MockItems.Where(item =>
        EF.Functions.MatchAny(
            EF.Functions.Alias(item.Description, "description_simple"),
            "Sleek running shoes"
        )
    )
    .Select(item => new { item.Description, item.Rating, item.Category })
    .ToListAsync();

// Query against `description`
await dbContext
    .MockItems.Where(item =>
        EF.Functions.MatchAny(item.Description, "Sleek running shoes")
    )
    .Select(item => new { item.Description, item.Rating, item.Category })
    .ToListAsync();

If a text field uses multiple tokenizers and one of them is literal, we recommend aliasing the other tokenizers and leaving the literal tokenizer un-aliased. This is so queries that GROUP BY, ORDER BY, or aggregate the text field can reference the field directly:

SQL

CREATE INDEX search_idx ON mock_items
USING bm25 (
  id,
  (description::pdb.literal),
  (description::pdb.simple('alias=description_simple'))
) WITH (key_field='id');

SELECT description, rating, category
FROM mock_items
WHERE description @@@ 'shoes'
ORDER BY description
LIMIT 5;