refactor(search): limit BM25 relevance ranking to pure-text searches

Rank ParadeDB search results by BM25 relevance only for pure-text searches over a plain project scope. Numeric searches (the `OR index = N` branch) and the Favorites view (the `id IN (<subquery>)` scope) keep the default ordering (unranked, as on main): pdb.score rejects both as unsupported query shapes, and the contortions previously needed to score them (two-arm numeric merge with in-memory pagination, a favorites LEFT JOIN) added far more complexity than the ranking was worth. Neither path was ranked before this PR, so leaving them at the default order is no regression.
2026-06-21 18:49:41 +02:00 · 2026-06-21 18:49:41 +02:00 · cefa42da86
parent 78dde2fb18
commit cefa42da86
3 changed files with 64 additions and 213 deletions
--- a/pkg/models/task_collection.go
+++ b/pkg/models/task_collection.go
@ -142,7 +142,7 @@ func getTaskFilterOptsFromCollection(tf *TaskCollection, projectView *ProjectVie
 	opts = &taskSearchOptions{
 		sortby:             sort,
-		userProvidedSort:   len(sort) > 0,
+		userProvidedSort:   len(tf.SortBy) > 0,
 		filterIncludeNulls: tf.FilterIncludeNulls,
 		filter:             tf.Filter,
 		filterTimezone:     tf.FilterTimezone,
--- a/pkg/models/task_search.go
+++ b/pkg/models/task_search.go
@ -320,17 +320,11 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
 	// Then return all tasks for that projects
 	var where builder.Cond
 	// textSearchCond holds only the ParadeDB/ILIKE title+description match, kept
 	// separate from the index-equality match so the relevance ranking path can
 	// score a pure-ParadeDB query (see rankByRelevance below).
 	var textSearchCond builder.Cond
 	var searchIndex int64
 	searchIndex := getTaskIndexFromSearchString(opts.search)
 	if opts.search != "" {
-		textSearchCond = db.MultiFieldSearchWithTableAlias([]string{"title", "description"}, opts.search, "tasks")
+		where = db.MultiFieldSearchWithTableAlias([]string{"title", "description"}, opts.search, "tasks")
 		where = textSearchCond
 		searchIndex = getTaskIndexFromSearchString(opts.search)
 		if searchIndex > 0 {
 			where = builder.Or(where, builder.Eq{"tasks.`index`": searchIndex})
 		}
@ -356,27 +350,8 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
 		favoritesCond = builder.In("tasks.id", favCond)
 	}
 	scopeCond := builder.Or(projectIDCond, favoritesCond)
 	limit, start := getLimitFromPageIndex(opts.page, opts.perPage)
-	cond := builder.And(scopeCond, where, filterCond)
+	cond := builder.And(builder.Or(projectIDCond, favoritesCond), where, filterCond)
 	// ParadeDB exposes the BM25 relevance score via pdb.score(<key_field>) for any
 	// query containing a ParadeDB operator (the ||| from MultiFieldSearch qualifies).
 	// When searching without an explicit user sort, order by relevance so tasks
 	// matching all query words rank above tasks matching only some. This is
 	// ParadeDB-only: pdb.score is invalid SQL on sqlite/mysql/plain postgres.
 	rankByRelevance := db.ParadeDBAvailable() && opts.search != "" && !opts.userProvidedSort
 	// ParadeDB's pdb.score() rejects an `id IN (<subquery>)` favorites scope (whether
 	// expressed as OR or UNION) as an unsupported query shape, so the relevance arms
 	// reach favorites through a LEFT JOIN and scope on the joined column instead,
 	// which it can score. Only relevant when favorites are part of the scope.
 	rankFavoritesJoin := rankByRelevance && d.hasFavoritesProject
 	rankScopeCond := scopeCond
 	if rankFavoritesJoin {
 		rankScopeCond = builder.Or(projectIDCond, builder.Expr("rank_favorites.entity_id IS NOT NULL"))
 	}
 	var distinct = "tasks.*"
 	if strings.Contains(orderby, "task_positions.") {
@ -391,124 +366,73 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
 		}
 	}
 	// addJoins applies the same LEFT JOINs the count query and every fetch arm
 	// rely on (position sort, bucket filter, subtask expansion).
 	addJoins := func(query *xorm.Session) *xorm.Session {
 		for _, param := range opts.sortby {
 			if param.sortBy == taskPropertyPosition {
 				query = query.Join("LEFT", "task_positions", "task_positions.task_id = tasks.id AND task_positions.project_view_id = ?", param.projectViewID)
 				break
 			}
 		}
 		if joinTaskBuckets {
 			joinCond := "task_buckets.task_id = tasks.id"
 			if opts.projectViewID > 0 {
 				joinCond += " AND task_buckets.project_view_id = ?"
 				query = query.Join("LEFT", "task_buckets", joinCond, opts.projectViewID)
 			} else {
 				query = query.Join("LEFT", "task_buckets", joinCond)
 			}
 		}
 		if expandSubtasks {
 			query = query.
 				Join("LEFT", "task_relations", "tasks.id = task_relations.task_id and task_relations.relation_kind = 'parenttask'").
 				Join("LEFT", "tasks parent_tasks", "task_relations.other_task_id = parent_tasks.id")
 		}
 		return query
 	}
 	subtaskParentCond := builder.Or(
 		builder.IsNull{"task_relations.id"},
 		builder.IsNull{"parent_tasks.id"},
 		builder.Expr("parent_tasks.project_id != tasks.project_id"),
 	)
 	if expandSubtasks {
-		cond = builder.And(cond, subtaskParentCond)
+		cond = builder.And(cond, builder.Or(
 			builder.IsNull{"task_relations.id"},
 			builder.IsNull{"parent_tasks.id"},
 			builder.Expr("parent_tasks.project_id != tasks.project_id"),
 		))
 	}
-	// fetchTasks runs a single fetch arm: it builds the DISTINCT select (raw, so
+	// ParadeDB exposes the BM25 relevance score via pdb.score(tasks.id) for a query
-	// xorm doesn't quote-corrupt the pdb.score function call), applies the joins
+	// containing a ParadeDB operator (the ||| from MultiFieldSearch qualifies). When
-	// and the given order. paginate=false fetches every matching row so the caller
+	// searching without an explicit user sort, order by relevance so tasks matching
-	// can merge multiple arms and slice the combined result in Go.
+	// all query words rank above tasks matching only some.
-	fetchTasks := func(armCond builder.Cond, selectCols, armOrderby string, paginate bool) ([]*Task, error) {
+	//
-		query := d.s.Where(armCond)
+	// This is limited to pure-text searches over a plain project scope: numeric
-		if selectCols == distinct {
+	// searches add an `OR index = N` branch and the Favorites view scopes on an
-			query = query.Distinct(selectCols)
+	// `id IN (<subquery>)`, both of which pdb.score rejects as unsupported query
 	// shapes. Those keep the default ordering (unranked). pdb.score is also invalid
 	// SQL on sqlite/mysql/plain postgres, hence the ParadeDBAvailable() gate.
 	rankByRelevance := db.ParadeDBAvailable() &&
 		opts.search != "" &&
 		!opts.userProvidedSort &&
 		searchIndex == 0 &&
 		!d.hasFavoritesProject
 	query := d.s.Where(cond)
 	if rankByRelevance {
 		// Select() passes the raw column list through untouched while Distinct()
 		// (no args) still emits DISTINCT. Distinct("tasks.*, pdb.score(tasks.id)")
 		// would quote-corrupt the function call into "pdb"."score(tasks"."id)".
 		query = query.Select(distinct + ", pdb.score(tasks.id)").Distinct()
 		orderby = "pdb.score(tasks.id) DESC, " + orderby
 	} else {
 		query = query.Distinct(distinct)
 	}
 	if limit > 0 {
 		query = query.Limit(limit, start)
 	}
 	for _, param := range opts.sortby {
 		if param.sortBy == taskPropertyPosition {
 			query = query.Join("LEFT", "task_positions", "task_positions.task_id = tasks.id AND task_positions.project_view_id = ?", param.projectViewID)
 			break
 		}
 	}
 	if joinTaskBuckets {
 		joinCond := "task_buckets.task_id = tasks.id"
 		if opts.projectViewID > 0 {
 			joinCond += " AND task_buckets.project_view_id = ?"
 			query = query.Join("LEFT", "task_buckets", joinCond, opts.projectViewID)
 		} else {
-			// Select() passes the raw column list through untouched while Distinct()
+			query = query.Join("LEFT", "task_buckets", joinCond)
 			// (no args) still emits the DISTINCT keyword.
 			query = query.Select(selectCols).Distinct()
 		}
-		if paginate && limit > 0 {
+	}
-			query = query.Limit(limit, start)
+	if expandSubtasks {
-		}
+		query = query.
-		if rankFavoritesJoin {
+			Join("LEFT", "task_relations", "tasks.id = task_relations.task_id and task_relations.relation_kind = 'parenttask'").
-			query = query.Join("LEFT", "favorites rank_favorites", "rank_favorites.entity_id = tasks.id AND rank_favorites.user_id = ? AND rank_favorites.kind = ?", d.a.GetID(), FavoriteKindTask)
+			Join("LEFT", "tasks parent_tasks", "task_relations.other_task_id = parent_tasks.id")
 		}
 		query = addJoins(query)
 		armTasks := []*Task{}
 		if err := query.OrderBy(armOrderby).Find(&armTasks); err != nil {
 			sql, vals := query.LastSQL()
 			return nil, fmt.Errorf("could not fetch tasks, error was '%w', sql: '%v', values: %v", err, sql, vals)
 		}
 		return armTasks, nil
 	}
-	rankCondWith := func(searchCond builder.Cond) builder.Cond {
+	tasks = []*Task{}
-		c := builder.And(rankScopeCond, searchCond, filterCond)
+	err = query.
-		if expandSubtasks {
+		OrderBy(orderby).
-			c = builder.And(c, subtaskParentCond)
+		Find(&tasks)
-		}
+	if err != nil {
-		return c
+		sql, vals := query.LastSQL()
-	}
+		return nil, 0, fmt.Errorf("could not fetch tasks, error was '%w', sql: '%v', values: %v", err, sql, vals)
 	switch {
 	case rankByRelevance && searchIndex > 0:
 		// A numeric search matches both the task index and the fuzzy text. pdb.score
 		// can only score a pure-ParadeDB query, so a `||| ... OR index = N` group is
 		// an unsupported query shape on ParadeDB. Run two supported arms instead and
 		// rank exact index matches first, then text matches by relevance.
 		indexTasks, err := fetchTasks(rankCondWith(builder.Eq{"tasks.`index`": searchIndex}), distinct, orderby, false)
 		if err != nil {
 			return nil, 0, err
 		}
 		textTasks, err := fetchTasks(rankCondWith(textSearchCond), distinct+", pdb.score(tasks.id)", "pdb.score(tasks.id) DESC, "+orderby, false)
 		if err != nil {
 			return nil, 0, err
 		}
 		// Exact index matches rank first; dedup a task matching both arms in favour
 		// of its index-match position.
 		seen := make(map[int64]bool, len(indexTasks)+len(textTasks))
 		merged := make([]*Task, 0, len(indexTasks)+len(textTasks))
 		for _, t := range indexTasks {
 			if !seen[t.ID] {
 				seen[t.ID] = true
 				merged = append(merged, t)
 			}
 		}
 		for _, t := range textTasks {
 			if !seen[t.ID] {
 				seen[t.ID] = true
 				merged = append(merged, t)
 			}
 		}
 		tasks = paginateInMemory(merged, limit, start)
 	case rankByRelevance:
 		// Pure text search: a single pdb.score-ordered query over the score-able
 		// scope is a supported shape.
 		tasks, err = fetchTasks(rankCondWith(textSearchCond), distinct+", pdb.score(tasks.id)", "pdb.score(tasks.id) DESC, "+orderby, true)
 		if err != nil {
 			return nil, 0, err
 		}
 	default:
 		tasks, err = fetchTasks(cond, distinct, orderby, true)
 		if err != nil {
 			return nil, 0, err
 		}
 	}
 	// fetch subtasks when expanding
@ -584,16 +508,3 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
 	}
 	return
 }
 // paginateInMemory slices an already-ordered result set. limit == 0 means "no
 // limit" (return everything from start onwards), matching getLimitFromPageIndex.
 func paginateInMemory(tasks []*Task, limit, start int) []*Task {
 	if start >= len(tasks) {
 		return []*Task{}
 	}
 	tasks = tasks[start:]
 	if limit > 0 && limit < len(tasks) {
 		tasks = tasks[:limit]
 	}
 	return tasks
 }
--- a/pkg/models/task_search_test.go
+++ b/pkg/models/task_search_test.go
@ -17,7 +17,6 @@
 package models
 import (
 	"strconv"
 	"testing"
 	"code.vikunja.io/api/pkg/db"
@ -106,62 +105,3 @@ func TestTaskSearchRelevanceRanking(t *testing.T) {
 		assertRelevanceRanked(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1})
 	})
 }
 // TestTaskSearchRelevanceRankingNumericIndex covers a numeric search (e.g. "#42"):
 // it matches both a task by its per-project index and tasks whose title/description
 // contain that number via fuzzy text search. On ParadeDB the exact-index task must
 // rank first, then the text matches by relevance. This is the case that combines an
 // `index = N` equality with the ParadeDB ||| operators; scoring such a mixed boolean
 // group is an unsupported query shape, so it is run as two arms (index, then text).
 func TestTaskSearchRelevanceRankingNumericIndex(t *testing.T) {
 	db.LoadAndAssertFixtures(t)
 	s := db.NewSession()
 	defer s.Close()
 	usr := &user.User{ID: 1}
 	// The exact-index task: its index is what we search for. Its title deliberately
 	// does not contain the number, so it can only be found by the index match.
 	exactIndex := &Task{Title: "Quarterly planning offsite", ProjectID: 1}
 	require.NoError(t, exactIndex.Create(s, usr))
 	require.NotZero(t, exactIndex.Index)
 	indexStr := strconv.FormatInt(exactIndex.Index, 10)
 	search := "#" + indexStr
 	// Text matches: their titles contain the searched number so the fuzzy text arm
 	// returns them, but they are not the exact-index task.
 	textA := &Task{Title: "Review ticket " + search + " backlog", ProjectID: 1}
 	require.NoError(t, textA.Create(s, usr))
 	textB := &Task{Title: "Notes about " + search, ProjectID: 1}
 	require.NoError(t, textB.Create(s, usr))
 	assertIndexFirst := func(t *testing.T, tc *TaskCollection) {
 		got, _, _, err := tc.ReadAll(s, usr, search, 0, 50)
 		require.NoError(t, err)
 		gotTasks, is := got.([]*Task)
 		require.True(t, is)
 		gotIDs := make([]int64, len(gotTasks))
 		for i, tsk := range gotTasks {
 			gotIDs[i] = tsk.ID
 		}
 		require.Contains(t, gotIDs, exactIndex.ID, "the exact-index task should be returned")
 		if db.ParadeDBAvailable() {
 			require.NotEmpty(t, gotTasks)
 			assert.Equal(t, exactIndex.ID, gotTasks[0].ID, "the exact-index match should rank first")
 			assert.Contains(t, gotIDs, textA.ID, "text matches should also be returned, ranked after the index match")
 		}
 	}
 	t.Run("no view", func(t *testing.T) {
 		assertIndexFirst(t, &TaskCollection{ProjectID: 1})
 	})
 	t.Run("list view", func(t *testing.T) {
 		assertIndexFirst(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1})
 	})
 }