From 9fb0d86c1b497b2f2e599bf3b238adcabb00fda2 Mon Sep 17 00:00:00 2001 From: kolaente Date: Fri, 19 Jun 2026 20:46:28 +0200 Subject: [PATCH] feat(search): rank ParadeDB search results by BM25 relevance (#2690) When ParadeDB is in use and a search is run, results now keep the current fuzzy/OR matching but are ordered by BM25 relevance so tasks matching all query words rank above tasks matching only some. Details: - ParadeDB exposes the BM25 score via pdb.score(); Vikunja's key_field is id, so we order by pdb.score(tasks.id) DESC, then the existing order-by (ending in a stable tasks.id tiebreak). - Gating: relevance ordering only applies when ParadeDB is available, a search term is present, AND the user did not pass an explicit sort_by. An explicit user sort still wins; relevance only replaces the default (id / position) sort. - DISTINCT requires every ORDER BY expression to appear in the SELECT list, so pdb.score(tasks.id) is added to the selected columns too (for both the plain and task_positions-join query shapes). Because xorm's Distinct() quotes each column and corrupts the function call, the ranking path uses Select(rawColumns).Distinct() instead. - ParadeDB-only by nature: pdb.score is invalid SQL on sqlite, mysql and plain postgres, so those paths are completely unchanged. A test (TestTaskSearchRelevanceRanking) creates a task matching all query words plus two tasks matching only one word each, then searches a multi-word query. On ParadeDB it asserts the all-words task ranks first; on other databases it only asserts that the all-words task is returned (no ordering guarantee), so it stays green across the whole CI database matrix. The CI ParadeDB matrix entry exercises the ranking assertion. Follow-up (not in this change): boosting results where the words appear in order / in close proximity above plain all-words matches. 
Fixes #2690 --- pkg/models/task_collection.go | 1 + pkg/models/task_search.go | 24 ++++++++++++++-- pkg/models/task_search_test.go | 51 ++++++++++++++++++++++++++++++++++ pkg/models/tasks.go | 4 +++ 4 files changed, 77 insertions(+), 3 deletions(-) diff --git a/pkg/models/task_collection.go b/pkg/models/task_collection.go index bc217f7ca..442c3e657 100644 --- a/pkg/models/task_collection.go +++ b/pkg/models/task_collection.go @@ -142,6 +142,7 @@ func getTaskFilterOptsFromCollection(tf *TaskCollection, projectView *ProjectVie opts = &taskSearchOptions{ sortby: sort, + userProvidedSort: len(tf.SortBy) > 0, filterIncludeNulls: tf.FilterIncludeNulls, filter: tf.Filter, filterTimezone: tf.FilterTimezone, diff --git a/pkg/models/task_search.go b/pkg/models/task_search.go index 07da3f809..e6d06f4e8 100644 --- a/pkg/models/task_search.go +++ b/pkg/models/task_search.go @@ -353,10 +353,22 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo limit, start := getLimitFromPageIndex(opts.page, opts.perPage) cond := builder.And(builder.Or(projectIDCond, favoritesCond), where, filterCond) + // ParadeDB exposes the BM25 relevance score via pdb.score() for any + // query containing a ParadeDB operator (the ||| from MultiFieldSearch qualifies). + // When searching without an explicit user sort, order by relevance so tasks + // matching all query words rank above tasks matching only some. This is + // ParadeDB-only: pdb.score is invalid SQL on sqlite/mysql/plain postgres. + rankByRelevance := db.ParadeDBAvailable() && opts.search != "" && !opts.userProvidedSort + var distinct = "tasks.*" if strings.Contains(orderby, "task_positions.") { distinct += ", task_positions.position" } + if rankByRelevance { + // DISTINCT requires every ORDER BY expression to appear in the SELECT list. 
+ distinct += ", pdb.score(tasks.id)" + orderby = "pdb.score(tasks.id) DESC, " + orderby + } var expandSubtasks = false for _, expandable := range opts.expand { @@ -374,9 +386,15 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo )) } - query := d.s. - Distinct(distinct). - Where(cond) + query := d.s.Where(cond) + if rankByRelevance { + // xorm's Distinct() quotes each column, which corrupts the pdb.score(tasks.id) + // function call. Select() passes the raw column list through untouched, and + // Distinct() (no args) still emits the DISTINCT keyword. + query = query.Select(distinct).Distinct() + } else { + query = query.Distinct(distinct) + } if limit > 0 { query = query.Limit(limit, start) } diff --git a/pkg/models/task_search_test.go b/pkg/models/task_search_test.go index 83e52700a..e5af2e95d 100644 --- a/pkg/models/task_search_test.go +++ b/pkg/models/task_search_test.go @@ -54,3 +54,54 @@ func TestKanbanViewBucketFiltering(t *testing.T) { assert.NotContains(t, taskBuckets, id) } } + +// TestTaskSearchRelevanceRanking verifies that a multi-word search ranks the task +// matching all words above tasks matching only some. The ranking is BM25-based and +// therefore only enforced on ParadeDB; on other databases we only assert that the +// matching tasks are returned (no order guarantee), keeping the test green across +// the whole CI database matrix. 
+func TestTaskSearchRelevanceRanking(t *testing.T) { + db.LoadAndAssertFixtures(t) + s := db.NewSession() + defer s.Close() + + usr := &user.User{ID: 1} + + allWords := &Task{Title: "Backup server migration", ProjectID: 1} + require.NoError(t, allWords.Create(s, usr)) + oneWordA := &Task{Title: "Backup of old files", ProjectID: 1} + require.NoError(t, oneWordA.Create(s, usr)) + oneWordB := &Task{Title: "server room booking", ProjectID: 1} + require.NoError(t, oneWordB.Create(s, usr)) + + assertRelevanceRanked := func(t *testing.T, tc *TaskCollection) { + got, _, _, err := tc.ReadAll(s, usr, "backup server", 0, 50) + require.NoError(t, err) + + gotTasks, is := got.([]*Task) + require.True(t, is) + + gotIDs := make([]int64, len(gotTasks)) + for i, tsk := range gotTasks { + gotIDs[i] = tsk.ID + } + + require.Contains(t, gotIDs, allWords.ID, "the task matching all words should be returned") + + if db.ParadeDBAvailable() { + require.NotEmpty(t, gotTasks) + assert.Equal(t, allWords.ID, gotTasks[0].ID, "task matching all query words should rank first by BM25 relevance") + } + } + + // Without a view: plain "tasks.*, pdb.score(tasks.id)" select. + t.Run("no view", func(t *testing.T) { + assertRelevanceRanked(t, &TaskCollection{ProjectID: 1}) + }) + + // With a view: exercises the task_positions LEFT JOIN, which adds + // task_positions.position to the DISTINCT select alongside pdb.score(tasks.id). + t.Run("list view", func(t *testing.T) { + assertRelevanceRanked(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1}) + }) +} diff --git a/pkg/models/tasks.go b/pkg/models/tasks.go index 978a0f850..262d2269d 100644 --- a/pkg/models/tasks.go +++ b/pkg/models/tasks.go @@ -214,6 +214,10 @@ type taskSearchOptions struct { projectIDs []int64 expand []TaskCollectionExpandable projectViewID int64 + + // userProvidedSort distinguishes an explicit sort_by from the id/position + // defaults appended later, so relevance ordering only replaces the default sort. 
+ userProvidedSort bool } // ReadAll is a dummy function to still have that endpoint documented