feat(search): rank ParadeDB search results by BM25 relevance (#2690)

When ParadeDB is in use and a search is run, results now keep the current fuzzy/OR matching but are ordered by BM25 relevance so tasks matching all query words rank above tasks matching only some. Details: - ParadeDB exposes the BM25 score via pdb.score(<key_field>); Vikunja's key_field is id, so we order by pdb.score(tasks.id) DESC, then the existing order-by (ending in a stable tasks.id tiebreak). - Gating: relevance ordering only applies when ParadeDB is available, a search term is present, AND the user did not pass an explicit sort_by. An explicit user sort still wins; relevance only replaces the default (id / position) sort. - DISTINCT requires every ORDER BY expression to appear in the SELECT list, so pdb.score(tasks.id) is added to the selected columns too (for both the plain and task_positions-join query shapes). Because xorm's Distinct() quotes each column and corrupts the function call, the ranking path uses Select(rawColumns).Distinct() instead. - ParadeDB-only by nature: pdb.score is invalid SQL on sqlite, mysql and plain postgres, so those paths are completely unchanged. A test (TestTaskSearchRelevanceRanking) creates a task matching all query words plus tasks matching only one, then searches a multi-word query. On ParadeDB it asserts the all-words task ranks first; on other databases it only asserts the matching tasks are returned, so it stays green across the whole CI database matrix. The CI ParadeDB matrix entry exercises the ranking assertion. Follow-up (not in this change): boosting results where the words appear in order / in close proximity above plain all-words matches. Fixes #2690
2026-06-19 20:46:28 +02:00 · 2026-06-19 20:46:28 +02:00 · 9fb0d86c1b
parent b6af132845
commit 9fb0d86c1b
4 changed files with 77 additions and 3 deletions
--- a/pkg/models/task_collection.go
+++ b/pkg/models/task_collection.go
@ -142,6 +142,7 @@ func getTaskFilterOptsFromCollection(tf *TaskCollection, projectView *ProjectVie

 	opts = &taskSearchOptions{
 		sortby:             sort,
+		userProvidedSort:   len(tf.SortBy) > 0,
 		filterIncludeNulls: tf.FilterIncludeNulls,
 		filter:             tf.Filter,
 		filterTimezone:     tf.FilterTimezone,
--- a/pkg/models/task_search.go
+++ b/pkg/models/task_search.go
@ -353,10 +353,22 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
 	limit, start := getLimitFromPageIndex(opts.page, opts.perPage)
 	cond := builder.And(builder.Or(projectIDCond, favoritesCond), where, filterCond)

+	// ParadeDB exposes the BM25 relevance score via pdb.score(<key_field>) for any
+	// query containing a ParadeDB operator (the ||| from MultiFieldSearch qualifies).
+	// When searching without an explicit user sort, order by relevance so tasks
+	// matching all query words rank above tasks matching only some. This is
+	// ParadeDB-only: pdb.score is invalid SQL on sqlite/mysql/plain postgres.
+	rankByRelevance := db.ParadeDBAvailable() && opts.search != "" && !opts.userProvidedSort
+
 	var distinct = "tasks.*"
 	if strings.Contains(orderby, "task_positions.") {
 		distinct += ", task_positions.position"
 	}
+	if rankByRelevance {
+		// DISTINCT requires every ORDER BY expression to appear in the SELECT list.
+		distinct += ", pdb.score(tasks.id)"
+		orderby = "pdb.score(tasks.id) DESC, " + orderby
+	}

 	var expandSubtasks = false
 	for _, expandable := range opts.expand {
@ -374,9 +386,15 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
 		))
 	}

-	query := d.s.
-		Distinct(distinct).
-		Where(cond)
+	query := d.s.Where(cond)
+	if rankByRelevance {
+		// xorm's Distinct() quotes each column, which corrupts the pdb.score(tasks.id)
+		// function call. Select() passes the raw column list through untouched, and
+		// Distinct() (no args) still emits the DISTINCT keyword.
+		query = query.Select(distinct).Distinct()
+	} else {
+		query = query.Distinct(distinct)
+	}
 	if limit > 0 {
 		query = query.Limit(limit, start)
 	}
--- a/pkg/models/task_search_test.go
+++ b/pkg/models/task_search_test.go
@ -54,3 +54,54 @@ func TestKanbanViewBucketFiltering(t *testing.T) {
 		assert.NotContains(t, taskBuckets, id)
 	}
 }
+
+// TestTaskSearchRelevanceRanking verifies that a multi-word search ranks the task
+// matching all words above tasks matching only some. The ranking is BM25-based and
+// therefore only enforced on ParadeDB; on other databases we only assert that the
+// matching tasks are returned (no order guarantee), keeping the test green across
+// the whole CI database matrix.
+func TestTaskSearchRelevanceRanking(t *testing.T) {
+	db.LoadAndAssertFixtures(t)
+	s := db.NewSession()
+	defer s.Close()
+
+	usr := &user.User{ID: 1}
+
+	allWords := &Task{Title: "Backup server migration", ProjectID: 1}
+	require.NoError(t, allWords.Create(s, usr))
+	oneWordA := &Task{Title: "Backup of old files", ProjectID: 1}
+	require.NoError(t, oneWordA.Create(s, usr))
+	oneWordB := &Task{Title: "server room booking", ProjectID: 1}
+	require.NoError(t, oneWordB.Create(s, usr))
+
+	assertRelevanceRanked := func(t *testing.T, tc *TaskCollection) {
+		got, _, _, err := tc.ReadAll(s, usr, "backup server", 0, 50)
+		require.NoError(t, err)
+
+		gotTasks, is := got.([]*Task)
+		require.True(t, is)
+
+		gotIDs := make([]int64, len(gotTasks))
+		for i, tsk := range gotTasks {
+			gotIDs[i] = tsk.ID
+		}
+
+		require.Contains(t, gotIDs, allWords.ID, "the task matching all words should be returned")
+
+		if db.ParadeDBAvailable() {
+			require.NotEmpty(t, gotTasks)
+			assert.Equal(t, allWords.ID, gotTasks[0].ID, "task matching all query words should rank first by BM25 relevance")
+		}
+	}
+
+	// Without a view: plain "tasks.*, pdb.score(tasks.id)" select.
+	t.Run("no view", func(t *testing.T) {
+		assertRelevanceRanked(t, &TaskCollection{ProjectID: 1})
+	})
+
+	// With a view: exercises the task_positions LEFT JOIN, which adds
+	// task_positions.position to the DISTINCT select alongside pdb.score(tasks.id).
+	t.Run("list view", func(t *testing.T) {
+		assertRelevanceRanked(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1})
+	})
+}
--- a/pkg/models/tasks.go
+++ b/pkg/models/tasks.go
@ -214,6 +214,10 @@ type taskSearchOptions struct {
 	projectIDs         []int64
 	expand             []TaskCollectionExpandable
 	projectViewID      int64
+
+	// userProvidedSort distinguishes an explicit sort_by from the id/position
+	// defaults appended later, so relevance ordering only replaces the default sort.
+	userProvidedSort bool
 }

 // ReadAll is a dummy function to still have that endpoint documented