From 9fb0d86c1b497b2f2e599bf3b238adcabb00fda2 Mon Sep 17 00:00:00 2001 From: kolaente Date: Fri, 19 Jun 2026 20:46:28 +0200 Subject: [PATCH 1/5] feat(search): rank ParadeDB search results by BM25 relevance (#2690) When ParadeDB is in use and a search is run, results now keep the current fuzzy/OR matching but are ordered by BM25 relevance so tasks matching all query words rank above tasks matching only some. Details: - ParadeDB exposes the BM25 score via pdb.score(); Vikunja's key_field is id, so we order by pdb.score(tasks.id) DESC, then the existing order-by (ending in a stable tasks.id tiebreak). - Gating: relevance ordering only applies when ParadeDB is available, a search term is present, AND the user did not pass an explicit sort_by. An explicit user sort still wins; relevance only replaces the default (id / position) sort. - DISTINCT requires every ORDER BY expression to appear in the SELECT list, so pdb.score(tasks.id) is added to the selected columns too (for both the plain and task_positions-join query shapes). Because xorm's Distinct() quotes each column and corrupts the function call, the ranking path uses Select(rawColumns).Distinct() instead. - ParadeDB-only by nature: pdb.score is invalid SQL on sqlite, mysql and plain postgres, so those paths are completely unchanged. A test (TestTaskSearchRelevanceRanking) creates a task matching all query words plus tasks matching only one, then searches a multi-word query. On ParadeDB it asserts the all-words task ranks first; on other databases it only asserts the matching tasks are returned, so it stays green across the whole CI database matrix. The CI ParadeDB matrix entry exercises the ranking assertion. Follow-up (not in this change): boosting results where the words appear in order / in close proximity above plain all-words matches. 
Fixes #2690 --- pkg/models/task_collection.go | 1 + pkg/models/task_search.go | 24 ++++++++++++++-- pkg/models/task_search_test.go | 51 ++++++++++++++++++++++++++++++++++ pkg/models/tasks.go | 4 +++ 4 files changed, 77 insertions(+), 3 deletions(-) diff --git a/pkg/models/task_collection.go b/pkg/models/task_collection.go index bc217f7ca..442c3e657 100644 --- a/pkg/models/task_collection.go +++ b/pkg/models/task_collection.go @@ -142,6 +142,7 @@ func getTaskFilterOptsFromCollection(tf *TaskCollection, projectView *ProjectVie opts = &taskSearchOptions{ sortby: sort, + userProvidedSort: len(tf.SortBy) > 0, filterIncludeNulls: tf.FilterIncludeNulls, filter: tf.Filter, filterTimezone: tf.FilterTimezone, diff --git a/pkg/models/task_search.go b/pkg/models/task_search.go index 07da3f809..e6d06f4e8 100644 --- a/pkg/models/task_search.go +++ b/pkg/models/task_search.go @@ -353,10 +353,22 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo limit, start := getLimitFromPageIndex(opts.page, opts.perPage) cond := builder.And(builder.Or(projectIDCond, favoritesCond), where, filterCond) + // ParadeDB exposes the BM25 relevance score via pdb.score() for any + // query containing a ParadeDB operator (the ||| from MultiFieldSearch qualifies). + // When searching without an explicit user sort, order by relevance so tasks + // matching all query words rank above tasks matching only some. This is + // ParadeDB-only: pdb.score is invalid SQL on sqlite/mysql/plain postgres. + rankByRelevance := db.ParadeDBAvailable() && opts.search != "" && !opts.userProvidedSort + var distinct = "tasks.*" if strings.Contains(orderby, "task_positions.") { distinct += ", task_positions.position" } + if rankByRelevance { + // DISTINCT requires every ORDER BY expression to appear in the SELECT list. 
+ distinct += ", pdb.score(tasks.id)" + orderby = "pdb.score(tasks.id) DESC, " + orderby + } var expandSubtasks = false for _, expandable := range opts.expand { @@ -374,9 +386,15 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo )) } - query := d.s. - Distinct(distinct). - Where(cond) + query := d.s.Where(cond) + if rankByRelevance { + // xorm's Distinct() quotes each column, which corrupts the pdb.score(tasks.id) + // function call. Select() passes the raw column list through untouched, and + // Distinct() (no args) still emits the DISTINCT keyword. + query = query.Select(distinct).Distinct() + } else { + query = query.Distinct(distinct) + } if limit > 0 { query = query.Limit(limit, start) } diff --git a/pkg/models/task_search_test.go b/pkg/models/task_search_test.go index 83e52700a..e5af2e95d 100644 --- a/pkg/models/task_search_test.go +++ b/pkg/models/task_search_test.go @@ -54,3 +54,54 @@ func TestKanbanViewBucketFiltering(t *testing.T) { assert.NotContains(t, taskBuckets, id) } } + +// TestTaskSearchRelevanceRanking verifies that a multi-word search ranks the task +// matching all words above tasks matching only some. The ranking is BM25-based and +// therefore only enforced on ParadeDB; on other databases we only assert that the +// matching tasks are returned (no order guarantee), keeping the test green across +// the whole CI database matrix. 
+func TestTaskSearchRelevanceRanking(t *testing.T) { + db.LoadAndAssertFixtures(t) + s := db.NewSession() + defer s.Close() + + usr := &user.User{ID: 1} + + allWords := &Task{Title: "Backup server migration", ProjectID: 1} + require.NoError(t, allWords.Create(s, usr)) + oneWordA := &Task{Title: "Backup of old files", ProjectID: 1} + require.NoError(t, oneWordA.Create(s, usr)) + oneWordB := &Task{Title: "server room booking", ProjectID: 1} + require.NoError(t, oneWordB.Create(s, usr)) + + assertRelevanceRanked := func(t *testing.T, tc *TaskCollection) { + got, _, _, err := tc.ReadAll(s, usr, "backup server", 0, 50) + require.NoError(t, err) + + gotTasks, is := got.([]*Task) + require.True(t, is) + + gotIDs := make([]int64, len(gotTasks)) + for i, tsk := range gotTasks { + gotIDs[i] = tsk.ID + } + + require.Contains(t, gotIDs, allWords.ID, "the task matching all words should be returned") + + if db.ParadeDBAvailable() { + require.NotEmpty(t, gotTasks) + assert.Equal(t, allWords.ID, gotTasks[0].ID, "task matching all query words should rank first by BM25 relevance") + } + } + + // Without a view: plain "tasks.*, pdb.score(tasks.id)" select. + t.Run("no view", func(t *testing.T) { + assertRelevanceRanked(t, &TaskCollection{ProjectID: 1}) + }) + + // With a view: exercises the task_positions LEFT JOIN, which adds + // task_positions.position to the DISTINCT select alongside pdb.score(tasks.id). + t.Run("list view", func(t *testing.T) { + assertRelevanceRanked(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1}) + }) +} diff --git a/pkg/models/tasks.go b/pkg/models/tasks.go index 978a0f850..262d2269d 100644 --- a/pkg/models/tasks.go +++ b/pkg/models/tasks.go @@ -214,6 +214,10 @@ type taskSearchOptions struct { projectIDs []int64 expand []TaskCollectionExpandable projectViewID int64 + + // userProvidedSort distinguishes an explicit sort_by from the id/position + // defaults appended later, so relevance ordering only replaces the default sort. 
+ userProvidedSort bool } // ReadAll is a dummy function to still have that endpoint documented From 116fb1e2e0c535884926fd90bab9fee8bd2f7b34 Mon Sep 17 00:00:00 2001 From: kolaente Date: Fri, 19 Jun 2026 22:52:26 +0200 Subject: [PATCH 2/5] fix(search): rank exact task-index match before BM25 text relevance on ParadeDB The BM25 relevance ranking added `pdb.score(tasks.id)` to the search SELECT and ORDER BY. ParadeDB can only compute a score for a pure-ParadeDB query shape, so two cases produced "pq: Unsupported query shape": 1. A numeric search (e.g. "#17") OR's the ParadeDB `|||` operators with a plain `"index" = N` equality in the same boolean group. Scoring that mixed group is unsupported. 2. When favorites are in scope, the `project_id IN (...) OR id IN (subquery)` predicate is unsupported under pdb.score regardless of how the subquery is expressed (OR or UNION) - it just was never exercised because the ranking tests searched a single project with no favorites. Both are now handled so each query ParadeDB scores is a supported shape: - Numeric search runs as two arms: an exact `index = N` arm (no score, ranked first) and a text `|||` arm scored by pdb.score DESC. The arms are merged in Go (index matches first, deduped by task id) and paginated in memory; the count query keeps the combined `OR index = N` predicate (no score), which is a supported shape, so totalItems stays correct. - The relevance arms reach favorites through a LEFT JOIN and scope on the joined column (`rank_favorites.entity_id IS NOT NULL`) instead of an id-IN-subquery, which ParadeDB can score. Non-numeric (pure text) searches keep the single pdb.score-ordered query. Non-ParadeDB databases are unchanged (no pdb.score, no ranking). TestTaskSearchRelevanceRankingNumericIndex covers the numeric case: on ParadeDB the exact-index task ranks first, then text matches by relevance; on other databases it only asserts the matches are returned. 
Validated against the CI-pinned ParadeDB image (paradedb 0.21.12): the full pkg/models and pkg/webtests suites pass, including TestTaskCollection_ReadAll/search_for_task_index and the HTTP search tests. --- pkg/models/task_search.go | 196 ++++++++++++++++++++++++--------- pkg/models/task_search_test.go | 60 ++++++++++ 2 files changed, 205 insertions(+), 51 deletions(-) diff --git a/pkg/models/task_search.go b/pkg/models/task_search.go index e6d06f4e8..f3e137d84 100644 --- a/pkg/models/task_search.go +++ b/pkg/models/task_search.go @@ -320,11 +320,17 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo // Then return all tasks for that projects var where builder.Cond + // textSearchCond holds only the ParadeDB/ILIKE title+description match, kept + // separate from the index-equality match so the relevance ranking path can + // score a pure-ParadeDB query (see rankByRelevance below). + var textSearchCond builder.Cond + var searchIndex int64 if opts.search != "" { - where = db.MultiFieldSearchWithTableAlias([]string{"title", "description"}, opts.search, "tasks") + textSearchCond = db.MultiFieldSearchWithTableAlias([]string{"title", "description"}, opts.search, "tasks") + where = textSearchCond - searchIndex := getTaskIndexFromSearchString(opts.search) + searchIndex = getTaskIndexFromSearchString(opts.search) if searchIndex > 0 { where = builder.Or(where, builder.Eq{"`index`": searchIndex}) } @@ -350,8 +356,10 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo favoritesCond = builder.In("tasks.id", favCond) } + scopeCond := builder.Or(projectIDCond, favoritesCond) + limit, start := getLimitFromPageIndex(opts.page, opts.perPage) - cond := builder.And(builder.Or(projectIDCond, favoritesCond), where, filterCond) + cond := builder.And(scopeCond, where, filterCond) // ParadeDB exposes the BM25 relevance score via pdb.score() for any // query containing a ParadeDB operator (the ||| from MultiFieldSearch 
qualifies). @@ -360,15 +368,20 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo // ParadeDB-only: pdb.score is invalid SQL on sqlite/mysql/plain postgres. rankByRelevance := db.ParadeDBAvailable() && opts.search != "" && !opts.userProvidedSort + // ParadeDB's pdb.score() rejects an `id IN ()` favorites scope (whether + // expressed as OR or UNION) as an unsupported query shape, so the relevance arms + // reach favorites through a LEFT JOIN and scope on the joined column instead, + // which it can score. Only relevant when favorites are part of the scope. + rankFavoritesJoin := rankByRelevance && d.hasFavoritesProject + rankScopeCond := scopeCond + if rankFavoritesJoin { + rankScopeCond = builder.Or(projectIDCond, builder.Expr("rank_favorites.entity_id IS NOT NULL")) + } + var distinct = "tasks.*" if strings.Contains(orderby, "task_positions.") { distinct += ", task_positions.position" } - if rankByRelevance { - // DISTINCT requires every ORDER BY expression to appear in the SELECT list. - distinct += ", pdb.score(tasks.id)" - orderby = "pdb.score(tasks.id) DESC, " + orderby - } var expandSubtasks = false for _, expandable := range opts.expand { @@ -378,56 +391,124 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo } } - if expandSubtasks { - cond = builder.And(cond, builder.Or( - builder.IsNull{"task_relations.id"}, - builder.IsNull{"parent_tasks.id"}, - builder.Expr("parent_tasks.project_id != tasks.project_id"), - )) - } - - query := d.s.Where(cond) - if rankByRelevance { - // xorm's Distinct() quotes each column, which corrupts the pdb.score(tasks.id) - // function call. Select() passes the raw column list through untouched, and - // Distinct() (no args) still emits the DISTINCT keyword. 
- query = query.Select(distinct).Distinct() - } else { - query = query.Distinct(distinct) - } - if limit > 0 { - query = query.Limit(limit, start) - } - - for _, param := range opts.sortby { - if param.sortBy == taskPropertyPosition { - query = query.Join("LEFT", "task_positions", "task_positions.task_id = tasks.id AND task_positions.project_view_id = ?", param.projectViewID) - break + // addJoins applies the same LEFT JOINs the count query and every fetch arm + // rely on (position sort, bucket filter, subtask expansion). + addJoins := func(query *xorm.Session) *xorm.Session { + for _, param := range opts.sortby { + if param.sortBy == taskPropertyPosition { + query = query.Join("LEFT", "task_positions", "task_positions.task_id = tasks.id AND task_positions.project_view_id = ?", param.projectViewID) + break + } } + if joinTaskBuckets { + joinCond := "task_buckets.task_id = tasks.id" + if opts.projectViewID > 0 { + joinCond += " AND task_buckets.project_view_id = ?" + query = query.Join("LEFT", "task_buckets", joinCond, opts.projectViewID) + } else { + query = query.Join("LEFT", "task_buckets", joinCond) + } + } + if expandSubtasks { + query = query. + Join("LEFT", "task_relations", "tasks.id = task_relations.task_id and task_relations.relation_kind = 'parenttask'"). + Join("LEFT", "tasks parent_tasks", "task_relations.other_task_id = parent_tasks.id") + } + return query } - if joinTaskBuckets { - joinCond := "task_buckets.task_id = tasks.id" - if opts.projectViewID > 0 { - joinCond += " AND task_buckets.project_view_id = ?" 
- query = query.Join("LEFT", "task_buckets", joinCond, opts.projectViewID) + subtaskParentCond := builder.Or( + builder.IsNull{"task_relations.id"}, + builder.IsNull{"parent_tasks.id"}, + builder.Expr("parent_tasks.project_id != tasks.project_id"), + ) + if expandSubtasks { + cond = builder.And(cond, subtaskParentCond) + } + + // fetchTasks runs a single fetch arm: it builds the DISTINCT select (raw, so + // xorm doesn't quote-corrupt the pdb.score function call), applies the joins + // and the given order. paginate=false fetches every matching row so the caller + // can merge multiple arms and slice the combined result in Go. + fetchTasks := func(armCond builder.Cond, selectCols, armOrderby string, paginate bool) ([]*Task, error) { + query := d.s.Where(armCond) + if selectCols == distinct { + query = query.Distinct(selectCols) } else { - query = query.Join("LEFT", "task_buckets", joinCond) + // Select() passes the raw column list through untouched while Distinct() + // (no args) still emits the DISTINCT keyword. + query = query.Select(selectCols).Distinct() } - } - if expandSubtasks { - query = query. - Join("LEFT", "task_relations", "tasks.id = task_relations.task_id and task_relations.relation_kind = 'parenttask'"). - Join("LEFT", "tasks parent_tasks", "task_relations.other_task_id = parent_tasks.id") + if paginate && limit > 0 { + query = query.Limit(limit, start) + } + if rankFavoritesJoin { + query = query.Join("LEFT", "favorites rank_favorites", "rank_favorites.entity_id = tasks.id AND rank_favorites.user_id = ? AND rank_favorites.kind = ?", d.a.GetID(), FavoriteKindTask) + } + query = addJoins(query) + + armTasks := []*Task{} + if err := query.OrderBy(armOrderby).Find(&armTasks); err != nil { + sql, vals := query.LastSQL() + return nil, fmt.Errorf("could not fetch tasks, error was '%w', sql: '%v', values: %v", err, sql, vals) + } + return armTasks, nil } - tasks = []*Task{} - err = query. - OrderBy(orderby). 
- Find(&tasks) - if err != nil { - sql, vals := query.LastSQL() - return nil, 0, fmt.Errorf("could not fetch tasks, error was '%w', sql: '%v', values: %v", err, sql, vals) + rankCondWith := func(searchCond builder.Cond) builder.Cond { + c := builder.And(rankScopeCond, searchCond, filterCond) + if expandSubtasks { + c = builder.And(c, subtaskParentCond) + } + return c + } + + switch { + case rankByRelevance && searchIndex > 0: + // A numeric search matches both the task index and the fuzzy text. pdb.score + // can only score a pure-ParadeDB query, so a `||| ... OR index = N` group is + // an unsupported query shape on ParadeDB. Run two supported arms instead and + // rank exact index matches first, then text matches by relevance. + indexTasks, err := fetchTasks(rankCondWith(builder.Eq{"`index`": searchIndex}), distinct, orderby, false) + if err != nil { + return nil, 0, err + } + + textTasks, err := fetchTasks(rankCondWith(textSearchCond), distinct+", pdb.score(tasks.id)", "pdb.score(tasks.id) DESC, "+orderby, false) + if err != nil { + return nil, 0, err + } + + // Exact index matches rank first; dedup a task matching both arms in favour + // of its index-match position. + seen := make(map[int64]bool, len(indexTasks)+len(textTasks)) + merged := make([]*Task, 0, len(indexTasks)+len(textTasks)) + for _, t := range indexTasks { + if !seen[t.ID] { + seen[t.ID] = true + merged = append(merged, t) + } + } + for _, t := range textTasks { + if !seen[t.ID] { + seen[t.ID] = true + merged = append(merged, t) + } + } + + tasks = paginateInMemory(merged, limit, start) + case rankByRelevance: + // Pure text search: a single pdb.score-ordered query over the score-able + // scope is a supported shape. 
+ tasks, err = fetchTasks(rankCondWith(textSearchCond), distinct+", pdb.score(tasks.id)", "pdb.score(tasks.id) DESC, "+orderby, true) + if err != nil { + return nil, 0, err + } + default: + tasks, err = fetchTasks(cond, distinct, orderby, true) + if err != nil { + return nil, 0, err + } } // fetch subtasks when expanding @@ -503,3 +584,16 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo } return } + +// paginateInMemory slices an already-ordered result set. limit == 0 means "no +// limit" (return everything from start onwards), matching getLimitFromPageIndex. +func paginateInMemory(tasks []*Task, limit, start int) []*Task { + if start >= len(tasks) { + return []*Task{} + } + tasks = tasks[start:] + if limit > 0 && limit < len(tasks) { + tasks = tasks[:limit] + } + return tasks +} diff --git a/pkg/models/task_search_test.go b/pkg/models/task_search_test.go index e5af2e95d..59eab6051 100644 --- a/pkg/models/task_search_test.go +++ b/pkg/models/task_search_test.go @@ -17,6 +17,7 @@ package models import ( + "strconv" "testing" "code.vikunja.io/api/pkg/db" @@ -105,3 +106,62 @@ func TestTaskSearchRelevanceRanking(t *testing.T) { assertRelevanceRanked(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1}) }) } + +// TestTaskSearchRelevanceRankingNumericIndex covers a numeric search (e.g. "#42"): +// it matches both a task by its per-project index and tasks whose title/description +// contain that number via fuzzy text search. On ParadeDB the exact-index task must +// rank first, then the text matches by relevance. This is the case that combines an +// `index = N` equality with the ParadeDB ||| operators; scoring such a mixed boolean +// group is an unsupported query shape, so it is run as two arms (index, then text). +func TestTaskSearchRelevanceRankingNumericIndex(t *testing.T) { + db.LoadAndAssertFixtures(t) + s := db.NewSession() + defer s.Close() + + usr := &user.User{ID: 1} + + // The exact-index task: its index is what we search for. 
Its title deliberately + // does not contain the number, so it can only be found by the index match. + exactIndex := &Task{Title: "Quarterly planning offsite", ProjectID: 1} + require.NoError(t, exactIndex.Create(s, usr)) + require.NotZero(t, exactIndex.Index) + + indexStr := strconv.FormatInt(exactIndex.Index, 10) + search := "#" + indexStr + + // Text matches: their titles contain the searched number so the fuzzy text arm + // returns them, but they are not the exact-index task. + textA := &Task{Title: "Review ticket " + search + " backlog", ProjectID: 1} + require.NoError(t, textA.Create(s, usr)) + textB := &Task{Title: "Notes about " + search, ProjectID: 1} + require.NoError(t, textB.Create(s, usr)) + + assertIndexFirst := func(t *testing.T, tc *TaskCollection) { + got, _, _, err := tc.ReadAll(s, usr, search, 0, 50) + require.NoError(t, err) + + gotTasks, is := got.([]*Task) + require.True(t, is) + + gotIDs := make([]int64, len(gotTasks)) + for i, tsk := range gotTasks { + gotIDs[i] = tsk.ID + } + + require.Contains(t, gotIDs, exactIndex.ID, "the exact-index task should be returned") + + if db.ParadeDBAvailable() { + require.NotEmpty(t, gotTasks) + assert.Equal(t, exactIndex.ID, gotTasks[0].ID, "the exact-index match should rank first") + assert.Contains(t, gotIDs, textA.ID, "text matches should also be returned, ranked after the index match") + } + } + + t.Run("no view", func(t *testing.T) { + assertIndexFirst(t, &TaskCollection{ProjectID: 1}) + }) + + t.Run("list view", func(t *testing.T) { + assertIndexFirst(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1}) + }) +} From d93e98f76b43fe8c0e44024985d3d7cb20dc12a1 Mon Sep 17 00:00:00 2001 From: kolaente Date: Fri, 19 Jun 2026 23:14:51 +0200 Subject: [PATCH 3/5] fix(search): qualify the task index column to avoid ambiguity with the parent-task join --- pkg/models/task_search.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/models/task_search.go b/pkg/models/task_search.go index 
f3e137d84..a8884ba7f 100644 --- a/pkg/models/task_search.go +++ b/pkg/models/task_search.go @@ -332,7 +332,7 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo searchIndex = getTaskIndexFromSearchString(opts.search) if searchIndex > 0 { - where = builder.Or(where, builder.Eq{"`index`": searchIndex}) + where = builder.Or(where, builder.Eq{"tasks.`index`": searchIndex}) } } @@ -469,7 +469,7 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo // can only score a pure-ParadeDB query, so a `||| ... OR index = N` group is // an unsupported query shape on ParadeDB. Run two supported arms instead and // rank exact index matches first, then text matches by relevance. - indexTasks, err := fetchTasks(rankCondWith(builder.Eq{"`index`": searchIndex}), distinct, orderby, false) + indexTasks, err := fetchTasks(rankCondWith(builder.Eq{"tasks.`index`": searchIndex}), distinct, orderby, false) if err != nil { return nil, 0, err } From 78dde2fb18c9a13e83c0cacf720ea70621fc0691 Mon Sep 17 00:00:00 2001 From: kolaente Date: Fri, 19 Jun 2026 23:14:55 +0200 Subject: [PATCH 4/5] fix(search): derive userProvidedSort from the effective sort so relevance ranking applies in negative-id views --- pkg/models/task_collection.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/models/task_collection.go b/pkg/models/task_collection.go index 442c3e657..fdf1f3e09 100644 --- a/pkg/models/task_collection.go +++ b/pkg/models/task_collection.go @@ -142,7 +142,7 @@ func getTaskFilterOptsFromCollection(tf *TaskCollection, projectView *ProjectVie opts = &taskSearchOptions{ sortby: sort, - userProvidedSort: len(tf.SortBy) > 0, + userProvidedSort: len(sort) > 0, filterIncludeNulls: tf.FilterIncludeNulls, filter: tf.Filter, filterTimezone: tf.FilterTimezone, From cefa42da86fec92057b648eb6e757b2f6c7cc186 Mon Sep 17 00:00:00 2001 From: kolaente Date: Sun, 21 Jun 2026 18:49:41 +0200 Subject: [PATCH 5/5] refactor(search): limit 
BM25 relevance ranking to pure-text searches Rank ParadeDB search results by BM25 relevance only for pure-text searches over a plain project scope. Numeric searches (the `OR index = N` branch) and the Favorites view (the `id IN (subquery)` scope) keep the default ordering (unranked, as on main): pdb.score rejects both as unsupported query shapes, and the contortions previously needed to score them (two-arm numeric merge with in-memory pagination, a favorites LEFT JOIN) added far more complexity than the ranking was worth. Neither path was ranked before this PR, so leaving them at the default order is no regression. --- pkg/models/task_collection.go | 2 +- pkg/models/task_search.go | 215 ++++++++++----------------------- pkg/models/task_search_test.go | 60 --------- 3 files changed, 64 insertions(+), 213 deletions(-) diff --git a/pkg/models/task_collection.go b/pkg/models/task_collection.go index fdf1f3e09..442c3e657 100644 --- a/pkg/models/task_collection.go +++ b/pkg/models/task_collection.go @@ -142,7 +142,7 @@ func getTaskFilterOptsFromCollection(tf *TaskCollection, projectView *ProjectVie opts = &taskSearchOptions{ sortby: sort, - userProvidedSort: len(sort) > 0, + userProvidedSort: len(tf.SortBy) > 0, filterIncludeNulls: tf.FilterIncludeNulls, filter: tf.Filter, filterTimezone: tf.FilterTimezone, diff --git a/pkg/models/task_search.go b/pkg/models/task_search.go index a8884ba7f..5eb7f2b2a 100644 --- a/pkg/models/task_search.go +++ b/pkg/models/task_search.go @@ -320,17 +320,11 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo // Then return all tasks for that projects var where builder.Cond - // textSearchCond holds only the ParadeDB/ILIKE title+description match, kept - // separate from the index-equality match so the relevance ranking path can - // score a pure-ParadeDB query (see rankByRelevance below). 
- var textSearchCond builder.Cond - var searchIndex int64 + searchIndex := getTaskIndexFromSearchString(opts.search) if opts.search != "" { - textSearchCond = db.MultiFieldSearchWithTableAlias([]string{"title", "description"}, opts.search, "tasks") - where = textSearchCond + where = db.MultiFieldSearchWithTableAlias([]string{"title", "description"}, opts.search, "tasks") - searchIndex = getTaskIndexFromSearchString(opts.search) if searchIndex > 0 { where = builder.Or(where, builder.Eq{"tasks.`index`": searchIndex}) } @@ -356,27 +350,8 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo favoritesCond = builder.In("tasks.id", favCond) } - scopeCond := builder.Or(projectIDCond, favoritesCond) - limit, start := getLimitFromPageIndex(opts.page, opts.perPage) - cond := builder.And(scopeCond, where, filterCond) - - // ParadeDB exposes the BM25 relevance score via pdb.score() for any - // query containing a ParadeDB operator (the ||| from MultiFieldSearch qualifies). - // When searching without an explicit user sort, order by relevance so tasks - // matching all query words rank above tasks matching only some. This is - // ParadeDB-only: pdb.score is invalid SQL on sqlite/mysql/plain postgres. - rankByRelevance := db.ParadeDBAvailable() && opts.search != "" && !opts.userProvidedSort - - // ParadeDB's pdb.score() rejects an `id IN ()` favorites scope (whether - // expressed as OR or UNION) as an unsupported query shape, so the relevance arms - // reach favorites through a LEFT JOIN and scope on the joined column instead, - // which it can score. Only relevant when favorites are part of the scope. 
- rankFavoritesJoin := rankByRelevance && d.hasFavoritesProject - rankScopeCond := scopeCond - if rankFavoritesJoin { - rankScopeCond = builder.Or(projectIDCond, builder.Expr("rank_favorites.entity_id IS NOT NULL")) - } + cond := builder.And(builder.Or(projectIDCond, favoritesCond), where, filterCond) var distinct = "tasks.*" if strings.Contains(orderby, "task_positions.") { @@ -391,124 +366,73 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo } } - // addJoins applies the same LEFT JOINs the count query and every fetch arm - // rely on (position sort, bucket filter, subtask expansion). - addJoins := func(query *xorm.Session) *xorm.Session { - for _, param := range opts.sortby { - if param.sortBy == taskPropertyPosition { - query = query.Join("LEFT", "task_positions", "task_positions.task_id = tasks.id AND task_positions.project_view_id = ?", param.projectViewID) - break - } - } - if joinTaskBuckets { - joinCond := "task_buckets.task_id = tasks.id" - if opts.projectViewID > 0 { - joinCond += " AND task_buckets.project_view_id = ?" - query = query.Join("LEFT", "task_buckets", joinCond, opts.projectViewID) - } else { - query = query.Join("LEFT", "task_buckets", joinCond) - } - } - if expandSubtasks { - query = query. - Join("LEFT", "task_relations", "tasks.id = task_relations.task_id and task_relations.relation_kind = 'parenttask'"). 
- Join("LEFT", "tasks parent_tasks", "task_relations.other_task_id = parent_tasks.id") - } - return query - } - - subtaskParentCond := builder.Or( - builder.IsNull{"task_relations.id"}, - builder.IsNull{"parent_tasks.id"}, - builder.Expr("parent_tasks.project_id != tasks.project_id"), - ) if expandSubtasks { - cond = builder.And(cond, subtaskParentCond) + cond = builder.And(cond, builder.Or( + builder.IsNull{"task_relations.id"}, + builder.IsNull{"parent_tasks.id"}, + builder.Expr("parent_tasks.project_id != tasks.project_id"), + )) } - // fetchTasks runs a single fetch arm: it builds the DISTINCT select (raw, so - // xorm doesn't quote-corrupt the pdb.score function call), applies the joins - // and the given order. paginate=false fetches every matching row so the caller - // can merge multiple arms and slice the combined result in Go. - fetchTasks := func(armCond builder.Cond, selectCols, armOrderby string, paginate bool) ([]*Task, error) { - query := d.s.Where(armCond) - if selectCols == distinct { - query = query.Distinct(selectCols) + // ParadeDB exposes the BM25 relevance score via pdb.score(tasks.id) for a query + // containing a ParadeDB operator (the ||| from MultiFieldSearch qualifies). When + // searching without an explicit user sort, order by relevance so tasks matching + // all query words rank above tasks matching only some. + // + // This is limited to pure-text searches over a plain project scope: numeric + // searches add an `OR index = N` branch and the Favorites view scopes on an + // `id IN ()`, both of which pdb.score rejects as unsupported query + // shapes. Those keep the default ordering (unranked). pdb.score is also invalid + // SQL on sqlite/mysql/plain postgres, hence the ParadeDBAvailable() gate. 
+ rankByRelevance := db.ParadeDBAvailable() && + opts.search != "" && + !opts.userProvidedSort && + searchIndex == 0 && + !d.hasFavoritesProject + + query := d.s.Where(cond) + if rankByRelevance { + // Select() passes the raw column list through untouched while Distinct() + // (no args) still emits DISTINCT. Distinct("tasks.*, pdb.score(tasks.id)") + // would quote-corrupt the function call into "pdb"."score(tasks"."id)". + query = query.Select(distinct + ", pdb.score(tasks.id)").Distinct() + orderby = "pdb.score(tasks.id) DESC, " + orderby + } else { + query = query.Distinct(distinct) + } + if limit > 0 { + query = query.Limit(limit, start) + } + + for _, param := range opts.sortby { + if param.sortBy == taskPropertyPosition { + query = query.Join("LEFT", "task_positions", "task_positions.task_id = tasks.id AND task_positions.project_view_id = ?", param.projectViewID) + break + } + } + + if joinTaskBuckets { + joinCond := "task_buckets.task_id = tasks.id" + if opts.projectViewID > 0 { + joinCond += " AND task_buckets.project_view_id = ?" + query = query.Join("LEFT", "task_buckets", joinCond, opts.projectViewID) } else { - // Select() passes the raw column list through untouched while Distinct() - // (no args) still emits the DISTINCT keyword. - query = query.Select(selectCols).Distinct() + query = query.Join("LEFT", "task_buckets", joinCond) } - if paginate && limit > 0 { - query = query.Limit(limit, start) - } - if rankFavoritesJoin { - query = query.Join("LEFT", "favorites rank_favorites", "rank_favorites.entity_id = tasks.id AND rank_favorites.user_id = ? AND rank_favorites.kind = ?", d.a.GetID(), FavoriteKindTask) - } - query = addJoins(query) - - armTasks := []*Task{} - if err := query.OrderBy(armOrderby).Find(&armTasks); err != nil { - sql, vals := query.LastSQL() - return nil, fmt.Errorf("could not fetch tasks, error was '%w', sql: '%v', values: %v", err, sql, vals) - } - return armTasks, nil + } + if expandSubtasks { + query = query. 
+ Join("LEFT", "task_relations", "tasks.id = task_relations.task_id and task_relations.relation_kind = 'parenttask'"). + Join("LEFT", "tasks parent_tasks", "task_relations.other_task_id = parent_tasks.id") } - rankCondWith := func(searchCond builder.Cond) builder.Cond { - c := builder.And(rankScopeCond, searchCond, filterCond) - if expandSubtasks { - c = builder.And(c, subtaskParentCond) - } - return c - } - - switch { - case rankByRelevance && searchIndex > 0: - // A numeric search matches both the task index and the fuzzy text. pdb.score - // can only score a pure-ParadeDB query, so a `||| ... OR index = N` group is - // an unsupported query shape on ParadeDB. Run two supported arms instead and - // rank exact index matches first, then text matches by relevance. - indexTasks, err := fetchTasks(rankCondWith(builder.Eq{"tasks.`index`": searchIndex}), distinct, orderby, false) - if err != nil { - return nil, 0, err - } - - textTasks, err := fetchTasks(rankCondWith(textSearchCond), distinct+", pdb.score(tasks.id)", "pdb.score(tasks.id) DESC, "+orderby, false) - if err != nil { - return nil, 0, err - } - - // Exact index matches rank first; dedup a task matching both arms in favour - // of its index-match position. - seen := make(map[int64]bool, len(indexTasks)+len(textTasks)) - merged := make([]*Task, 0, len(indexTasks)+len(textTasks)) - for _, t := range indexTasks { - if !seen[t.ID] { - seen[t.ID] = true - merged = append(merged, t) - } - } - for _, t := range textTasks { - if !seen[t.ID] { - seen[t.ID] = true - merged = append(merged, t) - } - } - - tasks = paginateInMemory(merged, limit, start) - case rankByRelevance: - // Pure text search: a single pdb.score-ordered query over the score-able - // scope is a supported shape. 
- tasks, err = fetchTasks(rankCondWith(textSearchCond), distinct+", pdb.score(tasks.id)", "pdb.score(tasks.id) DESC, "+orderby, true) - if err != nil { - return nil, 0, err - } - default: - tasks, err = fetchTasks(cond, distinct, orderby, true) - if err != nil { - return nil, 0, err - } + tasks = []*Task{} + err = query. + OrderBy(orderby). + Find(&tasks) + if err != nil { + sql, vals := query.LastSQL() + return nil, 0, fmt.Errorf("could not fetch tasks, error was '%w', sql: '%v', values: %v", err, sql, vals) } // fetch subtasks when expanding @@ -584,16 +508,3 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo } return } - -// paginateInMemory slices an already-ordered result set. limit == 0 means "no -// limit" (return everything from start onwards), matching getLimitFromPageIndex. -func paginateInMemory(tasks []*Task, limit, start int) []*Task { - if start >= len(tasks) { - return []*Task{} - } - tasks = tasks[start:] - if limit > 0 && limit < len(tasks) { - tasks = tasks[:limit] - } - return tasks -} diff --git a/pkg/models/task_search_test.go b/pkg/models/task_search_test.go index 59eab6051..e5af2e95d 100644 --- a/pkg/models/task_search_test.go +++ b/pkg/models/task_search_test.go @@ -17,7 +17,6 @@ package models import ( - "strconv" "testing" "code.vikunja.io/api/pkg/db" @@ -106,62 +105,3 @@ func TestTaskSearchRelevanceRanking(t *testing.T) { assertRelevanceRanked(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1}) }) } - -// TestTaskSearchRelevanceRankingNumericIndex covers a numeric search (e.g. "#42"): -// it matches both a task by its per-project index and tasks whose title/description -// contain that number via fuzzy text search. On ParadeDB the exact-index task must -// rank first, then the text matches by relevance. This is the case that combines an -// `index = N` equality with the ParadeDB ||| operators; scoring such a mixed boolean -// group is an unsupported query shape, so it is run as two arms (index, then text). 
-func TestTaskSearchRelevanceRankingNumericIndex(t *testing.T) { - db.LoadAndAssertFixtures(t) - s := db.NewSession() - defer s.Close() - - usr := &user.User{ID: 1} - - // The exact-index task: its index is what we search for. Its title deliberately - // does not contain the number, so it can only be found by the index match. - exactIndex := &Task{Title: "Quarterly planning offsite", ProjectID: 1} - require.NoError(t, exactIndex.Create(s, usr)) - require.NotZero(t, exactIndex.Index) - - indexStr := strconv.FormatInt(exactIndex.Index, 10) - search := "#" + indexStr - - // Text matches: their titles contain the searched number so the fuzzy text arm - // returns them, but they are not the exact-index task. - textA := &Task{Title: "Review ticket " + search + " backlog", ProjectID: 1} - require.NoError(t, textA.Create(s, usr)) - textB := &Task{Title: "Notes about " + search, ProjectID: 1} - require.NoError(t, textB.Create(s, usr)) - - assertIndexFirst := func(t *testing.T, tc *TaskCollection) { - got, _, _, err := tc.ReadAll(s, usr, search, 0, 50) - require.NoError(t, err) - - gotTasks, is := got.([]*Task) - require.True(t, is) - - gotIDs := make([]int64, len(gotTasks)) - for i, tsk := range gotTasks { - gotIDs[i] = tsk.ID - } - - require.Contains(t, gotIDs, exactIndex.ID, "the exact-index task should be returned") - - if db.ParadeDBAvailable() { - require.NotEmpty(t, gotTasks) - assert.Equal(t, exactIndex.ID, gotTasks[0].ID, "the exact-index match should rank first") - assert.Contains(t, gotIDs, textA.ID, "text matches should also be returned, ranked after the index match") - } - } - - t.Run("no view", func(t *testing.T) { - assertIndexFirst(t, &TaskCollection{ProjectID: 1}) - }) - - t.Run("list view", func(t *testing.T) { - assertIndexFirst(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1}) - }) -}