fix(search): rank exact task-index match before BM25 text relevance on ParadeDB

The BM25 relevance ranking added `pdb.score(tasks.id)` to the search SELECT
and ORDER BY. ParadeDB can only compute a score for a pure-ParadeDB query
shape, so two cases produced "pq: Unsupported query shape":

1. A numeric search (e.g. "#17") OR's the ParadeDB `|||` operators with a
   plain `"index" = N` equality in the same boolean group. Scoring that mixed
   group is unsupported.
2. When favorites are in scope, the `project_id IN (...) OR id IN (<favorites
   subquery>)` predicate is unsupported under pdb.score regardless of how the
   subquery is expressed (OR or UNION) - it just was never exercised because
   the ranking tests searched a single project with no favorites.

Both are now handled so each query ParadeDB scores is a supported shape:

- Numeric search runs as two arms: an exact `index = N` arm (no score, ranked
  first) and a text `|||` arm scored by pdb.score DESC. The arms are merged in
  Go (index matches first, deduped by task id) and paginated in memory; the
  count query keeps the combined `OR index = N` predicate (no score), which is
  a supported shape, so totalItems stays correct.
- The relevance arms reach favorites through a LEFT JOIN and scope on the
  joined column (`rank_favorites.entity_id IS NOT NULL`) instead of an
  id-IN-subquery, which ParadeDB can score.

Non-numeric (pure text) searches keep the single pdb.score-ordered query.
Non-ParadeDB databases are unchanged (no pdb.score, no ranking).

TestTaskSearchRelevanceRankingNumericIndex covers the numeric case: on
ParadeDB the exact-index task ranks first, then text matches by relevance; on
other databases it only asserts the matches are returned.

Validated against the CI-pinned ParadeDB image (paradedb 0.21.12): the full
pkg/models and pkg/webtests suites pass, including
TestTaskCollection_ReadAll/search_for_task_index and the HTTP search tests.
This commit is contained in:
kolaente 2026-06-19 22:52:26 +02:00
parent 9fb0d86c1b
commit 116fb1e2e0
2 changed files with 205 additions and 51 deletions

View File

@ -320,11 +320,17 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
// Then return all tasks for that projects
var where builder.Cond
// textSearchCond holds only the ParadeDB/ILIKE title+description match, kept
// separate from the index-equality match so the relevance ranking path can
// score a pure-ParadeDB query (see rankByRelevance below).
var textSearchCond builder.Cond
var searchIndex int64
if opts.search != "" {
where = db.MultiFieldSearchWithTableAlias([]string{"title", "description"}, opts.search, "tasks")
textSearchCond = db.MultiFieldSearchWithTableAlias([]string{"title", "description"}, opts.search, "tasks")
where = textSearchCond
searchIndex := getTaskIndexFromSearchString(opts.search)
searchIndex = getTaskIndexFromSearchString(opts.search)
if searchIndex > 0 {
where = builder.Or(where, builder.Eq{"`index`": searchIndex})
}
@ -350,8 +356,10 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
favoritesCond = builder.In("tasks.id", favCond)
}
scopeCond := builder.Or(projectIDCond, favoritesCond)
limit, start := getLimitFromPageIndex(opts.page, opts.perPage)
cond := builder.And(builder.Or(projectIDCond, favoritesCond), where, filterCond)
cond := builder.And(scopeCond, where, filterCond)
// ParadeDB exposes the BM25 relevance score via pdb.score(<key_field>) for any
// query containing a ParadeDB operator (the ||| from MultiFieldSearch qualifies).
@ -360,15 +368,20 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
// ParadeDB-only: pdb.score is invalid SQL on sqlite/mysql/plain postgres.
rankByRelevance := db.ParadeDBAvailable() && opts.search != "" && !opts.userProvidedSort
// ParadeDB's pdb.score() rejects an `id IN (<subquery>)` favorites scope (whether
// expressed as OR or UNION) as an unsupported query shape, so the relevance arms
// reach favorites through a LEFT JOIN and scope on the joined column instead,
// which it can score. Only relevant when favorites are part of the scope.
rankFavoritesJoin := rankByRelevance && d.hasFavoritesProject
rankScopeCond := scopeCond
if rankFavoritesJoin {
rankScopeCond = builder.Or(projectIDCond, builder.Expr("rank_favorites.entity_id IS NOT NULL"))
}
var distinct = "tasks.*"
if strings.Contains(orderby, "task_positions.") {
distinct += ", task_positions.position"
}
if rankByRelevance {
// DISTINCT requires every ORDER BY expression to appear in the SELECT list.
distinct += ", pdb.score(tasks.id)"
orderby = "pdb.score(tasks.id) DESC, " + orderby
}
var expandSubtasks = false
for _, expandable := range opts.expand {
@ -378,56 +391,124 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
}
}
if expandSubtasks {
cond = builder.And(cond, builder.Or(
builder.IsNull{"task_relations.id"},
builder.IsNull{"parent_tasks.id"},
builder.Expr("parent_tasks.project_id != tasks.project_id"),
))
}
query := d.s.Where(cond)
if rankByRelevance {
// xorm's Distinct() quotes each column, which corrupts the pdb.score(tasks.id)
// function call. Select() passes the raw column list through untouched, and
// Distinct() (no args) still emits the DISTINCT keyword.
query = query.Select(distinct).Distinct()
} else {
query = query.Distinct(distinct)
}
if limit > 0 {
query = query.Limit(limit, start)
}
for _, param := range opts.sortby {
if param.sortBy == taskPropertyPosition {
query = query.Join("LEFT", "task_positions", "task_positions.task_id = tasks.id AND task_positions.project_view_id = ?", param.projectViewID)
break
// addJoins applies the same LEFT JOINs the count query and every fetch arm
// rely on (position sort, bucket filter, subtask expansion).
addJoins := func(query *xorm.Session) *xorm.Session {
for _, param := range opts.sortby {
if param.sortBy == taskPropertyPosition {
query = query.Join("LEFT", "task_positions", "task_positions.task_id = tasks.id AND task_positions.project_view_id = ?", param.projectViewID)
break
}
}
if joinTaskBuckets {
joinCond := "task_buckets.task_id = tasks.id"
if opts.projectViewID > 0 {
joinCond += " AND task_buckets.project_view_id = ?"
query = query.Join("LEFT", "task_buckets", joinCond, opts.projectViewID)
} else {
query = query.Join("LEFT", "task_buckets", joinCond)
}
}
if expandSubtasks {
query = query.
Join("LEFT", "task_relations", "tasks.id = task_relations.task_id and task_relations.relation_kind = 'parenttask'").
Join("LEFT", "tasks parent_tasks", "task_relations.other_task_id = parent_tasks.id")
}
return query
}
if joinTaskBuckets {
joinCond := "task_buckets.task_id = tasks.id"
if opts.projectViewID > 0 {
joinCond += " AND task_buckets.project_view_id = ?"
query = query.Join("LEFT", "task_buckets", joinCond, opts.projectViewID)
subtaskParentCond := builder.Or(
builder.IsNull{"task_relations.id"},
builder.IsNull{"parent_tasks.id"},
builder.Expr("parent_tasks.project_id != tasks.project_id"),
)
if expandSubtasks {
cond = builder.And(cond, subtaskParentCond)
}
// fetchTasks runs a single fetch arm: it builds the DISTINCT select (raw, so
// xorm doesn't quote-corrupt the pdb.score function call), applies the joins
// and the given order. paginate=false fetches every matching row so the caller
// can merge multiple arms and slice the combined result in Go.
fetchTasks := func(armCond builder.Cond, selectCols, armOrderby string, paginate bool) ([]*Task, error) {
query := d.s.Where(armCond)
if selectCols == distinct {
query = query.Distinct(selectCols)
} else {
query = query.Join("LEFT", "task_buckets", joinCond)
// Select() passes the raw column list through untouched while Distinct()
// (no args) still emits the DISTINCT keyword.
query = query.Select(selectCols).Distinct()
}
}
if expandSubtasks {
query = query.
Join("LEFT", "task_relations", "tasks.id = task_relations.task_id and task_relations.relation_kind = 'parenttask'").
Join("LEFT", "tasks parent_tasks", "task_relations.other_task_id = parent_tasks.id")
if paginate && limit > 0 {
query = query.Limit(limit, start)
}
if rankFavoritesJoin {
query = query.Join("LEFT", "favorites rank_favorites", "rank_favorites.entity_id = tasks.id AND rank_favorites.user_id = ? AND rank_favorites.kind = ?", d.a.GetID(), FavoriteKindTask)
}
query = addJoins(query)
armTasks := []*Task{}
if err := query.OrderBy(armOrderby).Find(&armTasks); err != nil {
sql, vals := query.LastSQL()
return nil, fmt.Errorf("could not fetch tasks, error was '%w', sql: '%v', values: %v", err, sql, vals)
}
return armTasks, nil
}
tasks = []*Task{}
err = query.
OrderBy(orderby).
Find(&tasks)
if err != nil {
sql, vals := query.LastSQL()
return nil, 0, fmt.Errorf("could not fetch tasks, error was '%w', sql: '%v', values: %v", err, sql, vals)
rankCondWith := func(searchCond builder.Cond) builder.Cond {
c := builder.And(rankScopeCond, searchCond, filterCond)
if expandSubtasks {
c = builder.And(c, subtaskParentCond)
}
return c
}
switch {
case rankByRelevance && searchIndex > 0:
// A numeric search matches both the task index and the fuzzy text. pdb.score
// can only score a pure-ParadeDB query, so a `||| ... OR index = N` group is
// an unsupported query shape on ParadeDB. Run two supported arms instead and
// rank exact index matches first, then text matches by relevance.
indexTasks, err := fetchTasks(rankCondWith(builder.Eq{"`index`": searchIndex}), distinct, orderby, false)
if err != nil {
return nil, 0, err
}
textTasks, err := fetchTasks(rankCondWith(textSearchCond), distinct+", pdb.score(tasks.id)", "pdb.score(tasks.id) DESC, "+orderby, false)
if err != nil {
return nil, 0, err
}
// Exact index matches rank first; dedup a task matching both arms in favour
// of its index-match position.
seen := make(map[int64]bool, len(indexTasks)+len(textTasks))
merged := make([]*Task, 0, len(indexTasks)+len(textTasks))
for _, t := range indexTasks {
if !seen[t.ID] {
seen[t.ID] = true
merged = append(merged, t)
}
}
for _, t := range textTasks {
if !seen[t.ID] {
seen[t.ID] = true
merged = append(merged, t)
}
}
tasks = paginateInMemory(merged, limit, start)
case rankByRelevance:
// Pure text search: a single pdb.score-ordered query over the score-able
// scope is a supported shape.
tasks, err = fetchTasks(rankCondWith(textSearchCond), distinct+", pdb.score(tasks.id)", "pdb.score(tasks.id) DESC, "+orderby, true)
if err != nil {
return nil, 0, err
}
default:
tasks, err = fetchTasks(cond, distinct, orderby, true)
if err != nil {
return nil, 0, err
}
}
// fetch subtasks when expanding
@ -503,3 +584,16 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
}
return
}
// paginateInMemory slices an already-ordered result set. limit == 0 means "no
// limit" (return everything from start onwards), matching getLimitFromPageIndex.
func paginateInMemory(tasks []*Task, limit, start int) []*Task {
if start >= len(tasks) {
return []*Task{}
}
tasks = tasks[start:]
if limit > 0 && limit < len(tasks) {
tasks = tasks[:limit]
}
return tasks
}

View File

@ -17,6 +17,7 @@
package models
import (
"strconv"
"testing"
"code.vikunja.io/api/pkg/db"
@ -105,3 +106,62 @@ func TestTaskSearchRelevanceRanking(t *testing.T) {
assertRelevanceRanked(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1})
})
}
// TestTaskSearchRelevanceRankingNumericIndex covers a numeric search (e.g. "#42"):
// it matches both a task by its per-project index and tasks whose title/description
// contain that number via fuzzy text search. On ParadeDB the exact-index task must
// rank first, then the text matches by relevance. This is the case that combines an
// `index = N` equality with the ParadeDB ||| operators; scoring such a mixed boolean
// group is an unsupported query shape, so it is run as two arms (index, then text).
func TestTaskSearchRelevanceRankingNumericIndex(t *testing.T) {
db.LoadAndAssertFixtures(t)
s := db.NewSession()
defer s.Close()
usr := &user.User{ID: 1}
// The exact-index task: its index is what we search for. Its title deliberately
// does not contain the number, so it can only be found by the index match.
exactIndex := &Task{Title: "Quarterly planning offsite", ProjectID: 1}
require.NoError(t, exactIndex.Create(s, usr))
require.NotZero(t, exactIndex.Index)
indexStr := strconv.FormatInt(exactIndex.Index, 10)
search := "#" + indexStr
// Text matches: their titles contain the searched number so the fuzzy text arm
// returns them, but they are not the exact-index task.
textA := &Task{Title: "Review ticket " + search + " backlog", ProjectID: 1}
require.NoError(t, textA.Create(s, usr))
textB := &Task{Title: "Notes about " + search, ProjectID: 1}
require.NoError(t, textB.Create(s, usr))
assertIndexFirst := func(t *testing.T, tc *TaskCollection) {
got, _, _, err := tc.ReadAll(s, usr, search, 0, 50)
require.NoError(t, err)
gotTasks, is := got.([]*Task)
require.True(t, is)
gotIDs := make([]int64, len(gotTasks))
for i, tsk := range gotTasks {
gotIDs[i] = tsk.ID
}
require.Contains(t, gotIDs, exactIndex.ID, "the exact-index task should be returned")
if db.ParadeDBAvailable() {
require.NotEmpty(t, gotTasks)
assert.Equal(t, exactIndex.ID, gotTasks[0].ID, "the exact-index match should rank first")
assert.Contains(t, gotIDs, textA.ID, "text matches should also be returned, ranked after the index match")
}
}
t.Run("no view", func(t *testing.T) {
assertIndexFirst(t, &TaskCollection{ProjectID: 1})
})
t.Run("list view", func(t *testing.T) {
assertIndexFirst(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1})
})
}