From 9fb0d86c1b497b2f2e599bf3b238adcabb00fda2 Mon Sep 17 00:00:00 2001
From: kolaente <k@knt.li>
Date: Fri, 19 Jun 2026 20:46:28 +0200
Subject: [PATCH] feat(search): rank ParadeDB search results by BM25 relevance
 (#2690)

When ParadeDB is in use and a search is run, results now keep the current
fuzzy/OR matching but are ordered by BM25 relevance so tasks matching all
query words rank above tasks matching only some.

Details:
- ParadeDB exposes the BM25 score via pdb.score(<key_field>); Vikunja's
  key_field is id, so we order by pdb.score(tasks.id) DESC, then the
  existing order-by (ending in a stable tasks.id tiebreak).
- Gating: relevance ordering only applies when ParadeDB is available, a
  search term is present, AND the user did not pass an explicit sort_by.
  An explicit user sort still wins; relevance only replaces the default
  (id / position) sort.
- DISTINCT requires every ORDER BY expression to appear in the SELECT
  list, so pdb.score(tasks.id) is added to the selected columns too (for
  both the plain and task_positions-join query shapes). Because xorm's
  Distinct() quotes each column and corrupts the function call, the
  ranking path uses Select(rawColumns).Distinct() instead.
- ParadeDB-only by nature: pdb.score is invalid SQL on sqlite, mysql and
  plain postgres, so those paths are completely unchanged.

A test (TestTaskSearchRelevanceRanking) creates a task matching all query
words plus tasks matching only one, then searches a multi-word query. On
ParadeDB it asserts the all-words task ranks first; on other databases it
only asserts that the all-words task is returned, so it stays green across
the whole CI database matrix. The CI ParadeDB matrix entry exercises the
ranking assertion.

Follow-up (not in this change): boosting results where the words appear in
order / in close proximity above plain all-words matches.

Fixes #2690
---
 pkg/models/task_collection.go  |  1 +
 pkg/models/task_search.go      | 24 ++++++++++++++--
 pkg/models/task_search_test.go | 51 ++++++++++++++++++++++++++++++++++
 pkg/models/tasks.go            |  4 +++
 4 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/pkg/models/task_collection.go b/pkg/models/task_collection.go
index bc217f7ca..442c3e657 100644
--- a/pkg/models/task_collection.go
+++ b/pkg/models/task_collection.go
@@ -142,6 +142,7 @@ func getTaskFilterOptsFromCollection(tf *TaskCollection, projectView *ProjectVie
 
 	opts = &taskSearchOptions{
 		sortby:             sort,
+		userProvidedSort:   len(tf.SortBy) > 0,
 		filterIncludeNulls: tf.FilterIncludeNulls,
 		filter:             tf.Filter,
 		filterTimezone:     tf.FilterTimezone,
diff --git a/pkg/models/task_search.go b/pkg/models/task_search.go
index 07da3f809..e6d06f4e8 100644
--- a/pkg/models/task_search.go
+++ b/pkg/models/task_search.go
@@ -353,10 +353,22 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
 	limit, start := getLimitFromPageIndex(opts.page, opts.perPage)
 	cond := builder.And(builder.Or(projectIDCond, favoritesCond), where, filterCond)
 
+	// ParadeDB exposes the BM25 relevance score via pdb.score(<key_field>) for any
+	// query containing a ParadeDB operator (the ||| from MultiFieldSearch qualifies).
+	// When searching without an explicit user sort, order by relevance so tasks
+	// matching all query words rank above tasks matching only some. This is
+	// ParadeDB-only: pdb.score is invalid SQL on sqlite/mysql/plain postgres.
+	rankByRelevance := db.ParadeDBAvailable() && opts.search != "" && !opts.userProvidedSort
+
 	var distinct = "tasks.*"
 	if strings.Contains(orderby, "task_positions.") {
 		distinct += ", task_positions.position"
 	}
+	if rankByRelevance {
+		// DISTINCT requires every ORDER BY expression to appear in the SELECT list.
+		distinct += ", pdb.score(tasks.id)"
+		orderby = "pdb.score(tasks.id) DESC, " + orderby
+	}
 
 	var expandSubtasks = false
 	for _, expandable := range opts.expand {
@@ -374,9 +386,15 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
 		))
 	}
 
-	query := d.s.
-		Distinct(distinct).
-		Where(cond)
+	query := d.s.Where(cond)
+	if rankByRelevance {
+		// xorm's Distinct() quotes each column, which corrupts the pdb.score(tasks.id)
+		// function call. Select() passes the raw column list through untouched, and
+		// Distinct() (no args) still emits the DISTINCT keyword.
+		query = query.Select(distinct).Distinct()
+	} else {
+		query = query.Distinct(distinct)
+	}
 	if limit > 0 {
 		query = query.Limit(limit, start)
 	}
diff --git a/pkg/models/task_search_test.go b/pkg/models/task_search_test.go
index 83e52700a..e5af2e95d 100644
--- a/pkg/models/task_search_test.go
+++ b/pkg/models/task_search_test.go
@@ -54,3 +54,54 @@ func TestKanbanViewBucketFiltering(t *testing.T) {
 		assert.NotContains(t, taskBuckets, id)
 	}
 }
+
+// TestTaskSearchRelevanceRanking verifies that a multi-word search ranks the task
+// matching all words above tasks matching only some. The ranking is BM25-based and
+// therefore only enforced on ParadeDB; on other databases we only assert that the
+// all-words task is returned (no order guarantee), keeping the test green across
+// the whole CI database matrix.
+func TestTaskSearchRelevanceRanking(t *testing.T) {
+	db.LoadAndAssertFixtures(t)
+	s := db.NewSession()
+	defer s.Close()
+
+	usr := &user.User{ID: 1}
+
+	allWords := &Task{Title: "Backup server migration", ProjectID: 1}
+	require.NoError(t, allWords.Create(s, usr))
+	oneWordA := &Task{Title: "Backup of old files", ProjectID: 1}
+	require.NoError(t, oneWordA.Create(s, usr))
+	oneWordB := &Task{Title: "server room booking", ProjectID: 1}
+	require.NoError(t, oneWordB.Create(s, usr))
+
+	assertRelevanceRanked := func(t *testing.T, tc *TaskCollection) {
+		got, _, _, err := tc.ReadAll(s, usr, "backup server", 0, 50)
+		require.NoError(t, err)
+
+		gotTasks, is := got.([]*Task)
+		require.True(t, is)
+
+		gotIDs := make([]int64, len(gotTasks))
+		for i, tsk := range gotTasks {
+			gotIDs[i] = tsk.ID
+		}
+
+		require.Contains(t, gotIDs, allWords.ID, "the task matching all words should be returned")
+
+		if db.ParadeDBAvailable() {
+			require.NotEmpty(t, gotTasks) // guards the gotTasks[0] access below
+			assert.Equal(t, allWords.ID, gotTasks[0].ID, "task matching all query words should rank first by BM25 relevance")
+		}
+	}
+
+	// Without a view: on ParadeDB this is the plain "tasks.*, pdb.score(tasks.id)" select.
+	t.Run("no view", func(t *testing.T) {
+		assertRelevanceRanked(t, &TaskCollection{ProjectID: 1})
+	})
+
+	// With a view: exercises the task_positions LEFT JOIN, which adds
+	// task_positions.position to the DISTINCT select alongside pdb.score(tasks.id).
+	t.Run("list view", func(t *testing.T) {
+		assertRelevanceRanked(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1})
+	})
+}
diff --git a/pkg/models/tasks.go b/pkg/models/tasks.go
index 978a0f850..262d2269d 100644
--- a/pkg/models/tasks.go
+++ b/pkg/models/tasks.go
@@ -214,6 +214,10 @@ type taskSearchOptions struct {
 	projectIDs         []int64
 	expand             []TaskCollectionExpandable
 	projectViewID      int64
+
+	// userProvidedSort distinguishes an explicit sort_by from the id/position
+	// defaults appended later, so relevance ordering only replaces the default sort.
+	userProvidedSort bool
 }
 
 // ReadAll is a dummy function to still have that endpoint documented