feat(search): rank ParadeDB search results by BM25 relevance (#2690)

When ParadeDB is in use and a search is run, results now keep the current
fuzzy/OR matching but are ordered by BM25 relevance so tasks matching all
query words rank above tasks matching only some.

Details:
- ParadeDB exposes the BM25 score via pdb.score(<key_field>); Vikunja's
  key_field is id, so we order by pdb.score(tasks.id) DESC, then the
  existing order-by (ending in a stable tasks.id tiebreak).
- Gating: relevance ordering only applies when ParadeDB is available, a
  search term is present, AND the user did not pass an explicit sort_by.
  An explicit user sort still wins; relevance only replaces the default
  (id / position) sort.
- DISTINCT requires every ORDER BY expression to appear in the SELECT
  list, so pdb.score(tasks.id) is added to the selected columns too (for
  both the plain and task_positions-join query shapes). Because xorm's
  Distinct() quotes each column and corrupts the function call, the
  ranking path uses Select(rawColumns).Distinct() instead.
- ParadeDB-only by nature: pdb.score is invalid SQL on sqlite, mysql and
  plain postgres, so those paths are completely unchanged.

A test (TestTaskSearchRelevanceRanking) creates a task matching all query
words plus tasks matching only one, then searches a multi-word query. On
every database it asserts that the all-words task is returned; on ParadeDB
it additionally asserts that this task ranks first. The presence check is
order-independent, so the test stays green across the whole CI database
matrix, and the CI ParadeDB matrix entry exercises the ranking assertion.

Follow-up (not in this change): boosting results where the words appear in
order / in close proximity above plain all-words matches.

Fixes #2690
This commit is contained in:
kolaente 2026-06-19 20:46:28 +02:00
parent b6af132845
commit 9fb0d86c1b
4 changed files with 77 additions and 3 deletions

View File

@ -142,6 +142,7 @@ func getTaskFilterOptsFromCollection(tf *TaskCollection, projectView *ProjectVie
opts = &taskSearchOptions{
sortby: sort,
userProvidedSort: len(tf.SortBy) > 0,
filterIncludeNulls: tf.FilterIncludeNulls,
filter: tf.Filter,
filterTimezone: tf.FilterTimezone,

View File

@ -353,10 +353,22 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
limit, start := getLimitFromPageIndex(opts.page, opts.perPage)
cond := builder.And(builder.Or(projectIDCond, favoritesCond), where, filterCond)
// ParadeDB exposes the BM25 relevance score via pdb.score(<key_field>) for any
// query containing a ParadeDB operator (the ||| from MultiFieldSearch qualifies).
// When searching without an explicit user sort, order by relevance so tasks
// matching all query words rank above tasks matching only some. This is
// ParadeDB-only: pdb.score is invalid SQL on sqlite/mysql/plain postgres.
rankByRelevance := db.ParadeDBAvailable() && opts.search != "" && !opts.userProvidedSort
var distinct = "tasks.*"
if strings.Contains(orderby, "task_positions.") {
distinct += ", task_positions.position"
}
if rankByRelevance {
// DISTINCT requires every ORDER BY expression to appear in the SELECT list.
distinct += ", pdb.score(tasks.id)"
orderby = "pdb.score(tasks.id) DESC, " + orderby
}
var expandSubtasks = false
for _, expandable := range opts.expand {
@ -374,9 +386,15 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
))
}
query := d.s.
Distinct(distinct).
Where(cond)
query := d.s.Where(cond)
if rankByRelevance {
// xorm's Distinct() quotes each column, which corrupts the pdb.score(tasks.id)
// function call. Select() passes the raw column list through untouched, and
// Distinct() (no args) still emits the DISTINCT keyword.
query = query.Select(distinct).Distinct()
} else {
query = query.Distinct(distinct)
}
if limit > 0 {
query = query.Limit(limit, start)
}

View File

@ -54,3 +54,54 @@ func TestKanbanViewBucketFiltering(t *testing.T) {
assert.NotContains(t, taskBuckets, id)
}
}
// TestTaskSearchRelevanceRanking runs the multi-word search "backup server" against
// a project holding one task whose title contains both words and two tasks whose
// titles contain only one of them.
//
// On every database the all-words task must be part of the result. The ordering
// assertion (all-words task first) is gated on db.ParadeDBAvailable(), so databases
// without BM25 scoring are only checked for presence and the test stays green on them.
func TestTaskSearchRelevanceRanking(t *testing.T) {
	db.LoadAndAssertFixtures(t)
	s := db.NewSession()
	defer s.Close()

	usr := &user.User{ID: 1}

	// The task expected to rank first: its title contains every query word.
	allWords := &Task{Title: "Backup server migration", ProjectID: 1}
	require.NoError(t, allWords.Create(s, usr))

	// Competing tasks whose titles contain only a single query word each.
	for _, title := range []string{"Backup of old files", "server room booking"} {
		partial := &Task{Title: title, ProjectID: 1}
		require.NoError(t, partial.Create(s, usr))
	}

	cases := []struct {
		name       string
		collection *TaskCollection
	}{
		// No project view set on the collection.
		{name: "no view", collection: &TaskCollection{ProjectID: 1}},
		// Project view set; intended to cover the query shape that also selects
		// task_positions.position next to the relevance score.
		{name: "list view", collection: &TaskCollection{ProjectID: 1, ProjectViewID: 1}},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			result, _, _, err := c.collection.ReadAll(s, usr, "backup server", 0, 50)
			require.NoError(t, err)

			found, ok := result.([]*Task)
			require.True(t, ok)

			ids := make([]int64, 0, len(found))
			for _, task := range found {
				ids = append(ids, task.ID)
			}
			require.Contains(t, ids, allWords.ID, "the task matching all words should be returned")

			// Result order is only asserted when ParadeDB is available.
			if !db.ParadeDBAvailable() {
				return
			}
			require.NotEmpty(t, found)
			assert.Equal(t, allWords.ID, found[0].ID, "task matching all query words should rank first by BM25 relevance")
		})
	}
}

View File

@ -214,6 +214,10 @@ type taskSearchOptions struct {
projectIDs []int64
expand []TaskCollectionExpandable
projectViewID int64
// userProvidedSort distinguishes an explicit sort_by from the id/position
// defaults appended later, so relevance ordering only replaces the default sort.
userProvidedSort bool
}
// ReadAll is a dummy function to still have that endpoint documented