feat(search): rank ParadeDB search results by BM25 relevance (#2690)
When ParadeDB is in use and a search is run, results now keep the current fuzzy/OR matching but are ordered by BM25 relevance so tasks matching all query words rank above tasks matching only some.

Details:

- ParadeDB exposes the BM25 score via pdb.score(<key_field>); Vikunja's key_field is id, so we order by pdb.score(tasks.id) DESC, then the existing order-by (ending in a stable tasks.id tiebreak).
- Gating: relevance ordering only applies when ParadeDB is available, a search term is present, AND the user did not pass an explicit sort_by. An explicit user sort still wins; relevance only replaces the default (id / position) sort.
- DISTINCT requires every ORDER BY expression to appear in the SELECT list, so pdb.score(tasks.id) is added to the selected columns too (for both the plain and task_positions-join query shapes). Because xorm's Distinct() quotes each column and corrupts the function call, the ranking path uses Select(rawColumns).Distinct() instead.
- ParadeDB-only by nature: pdb.score is invalid SQL on sqlite, mysql and plain postgres, so those paths are completely unchanged.

A test (TestTaskSearchRelevanceRanking) creates a task matching all query words plus tasks matching only one, then searches a multi-word query. On ParadeDB it asserts the all-words task ranks first; on other databases it only asserts the matching tasks are returned, so it stays green across the whole CI database matrix. The CI ParadeDB matrix entry exercises the ranking assertion.

Follow-up (not in this change): boosting results where the words appear in order / in close proximity above plain all-words matches.

Fixes #2690
This commit is contained in:
parent
b6af132845
commit
9fb0d86c1b
|
|
@ -142,6 +142,7 @@ func getTaskFilterOptsFromCollection(tf *TaskCollection, projectView *ProjectVie
|
|||
|
||||
opts = &taskSearchOptions{
|
||||
sortby: sort,
|
||||
userProvidedSort: len(tf.SortBy) > 0,
|
||||
filterIncludeNulls: tf.FilterIncludeNulls,
|
||||
filter: tf.Filter,
|
||||
filterTimezone: tf.FilterTimezone,
|
||||
|
|
|
|||
|
|
@ -353,10 +353,22 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
|
|||
limit, start := getLimitFromPageIndex(opts.page, opts.perPage)
|
||||
cond := builder.And(builder.Or(projectIDCond, favoritesCond), where, filterCond)
|
||||
|
||||
// ParadeDB exposes the BM25 relevance score via pdb.score(<key_field>) for any
|
||||
// query containing a ParadeDB operator (the ||| from MultiFieldSearch qualifies).
|
||||
// When searching without an explicit user sort, order by relevance so tasks
|
||||
// matching all query words rank above tasks matching only some. This is
|
||||
// ParadeDB-only: pdb.score is invalid SQL on sqlite/mysql/plain postgres.
|
||||
rankByRelevance := db.ParadeDBAvailable() && opts.search != "" && !opts.userProvidedSort
|
||||
|
||||
var distinct = "tasks.*"
|
||||
if strings.Contains(orderby, "task_positions.") {
|
||||
distinct += ", task_positions.position"
|
||||
}
|
||||
if rankByRelevance {
|
||||
// DISTINCT requires every ORDER BY expression to appear in the SELECT list.
|
||||
distinct += ", pdb.score(tasks.id)"
|
||||
orderby = "pdb.score(tasks.id) DESC, " + orderby
|
||||
}
|
||||
|
||||
var expandSubtasks = false
|
||||
for _, expandable := range opts.expand {
|
||||
|
|
@ -374,9 +386,15 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
|
|||
))
|
||||
}
|
||||
|
||||
query := d.s.
|
||||
Distinct(distinct).
|
||||
Where(cond)
|
||||
query := d.s.Where(cond)
|
||||
if rankByRelevance {
|
||||
// xorm's Distinct() quotes each column, which corrupts the pdb.score(tasks.id)
|
||||
// function call. Select() passes the raw column list through untouched, and
|
||||
// Distinct() (no args) still emits the DISTINCT keyword.
|
||||
query = query.Select(distinct).Distinct()
|
||||
} else {
|
||||
query = query.Distinct(distinct)
|
||||
}
|
||||
if limit > 0 {
|
||||
query = query.Limit(limit, start)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -54,3 +54,54 @@ func TestKanbanViewBucketFiltering(t *testing.T) {
|
|||
assert.NotContains(t, taskBuckets, id)
|
||||
}
|
||||
}
|
||||
|
||||
// TestTaskSearchRelevanceRanking verifies that a multi-word search ranks the task
|
||||
// matching all words above tasks matching only some. The ranking is BM25-based and
|
||||
// therefore only enforced on ParadeDB; on other databases we only assert that the
|
||||
// matching tasks are returned (no order guarantee), keeping the test green across
|
||||
// the whole CI database matrix.
|
||||
func TestTaskSearchRelevanceRanking(t *testing.T) {
|
||||
db.LoadAndAssertFixtures(t)
|
||||
s := db.NewSession()
|
||||
defer s.Close()
|
||||
|
||||
usr := &user.User{ID: 1}
|
||||
|
||||
allWords := &Task{Title: "Backup server migration", ProjectID: 1}
|
||||
require.NoError(t, allWords.Create(s, usr))
|
||||
oneWordA := &Task{Title: "Backup of old files", ProjectID: 1}
|
||||
require.NoError(t, oneWordA.Create(s, usr))
|
||||
oneWordB := &Task{Title: "server room booking", ProjectID: 1}
|
||||
require.NoError(t, oneWordB.Create(s, usr))
|
||||
|
||||
assertRelevanceRanked := func(t *testing.T, tc *TaskCollection) {
|
||||
got, _, _, err := tc.ReadAll(s, usr, "backup server", 0, 50)
|
||||
require.NoError(t, err)
|
||||
|
||||
gotTasks, is := got.([]*Task)
|
||||
require.True(t, is)
|
||||
|
||||
gotIDs := make([]int64, len(gotTasks))
|
||||
for i, tsk := range gotTasks {
|
||||
gotIDs[i] = tsk.ID
|
||||
}
|
||||
|
||||
require.Contains(t, gotIDs, allWords.ID, "the task matching all words should be returned")
|
||||
|
||||
if db.ParadeDBAvailable() {
|
||||
require.NotEmpty(t, gotTasks)
|
||||
assert.Equal(t, allWords.ID, gotTasks[0].ID, "task matching all query words should rank first by BM25 relevance")
|
||||
}
|
||||
}
|
||||
|
||||
// Without a view: plain "tasks.*, pdb.score(tasks.id)" select.
|
||||
t.Run("no view", func(t *testing.T) {
|
||||
assertRelevanceRanked(t, &TaskCollection{ProjectID: 1})
|
||||
})
|
||||
|
||||
// With a view: exercises the task_positions LEFT JOIN, which adds
|
||||
// task_positions.position to the DISTINCT select alongside pdb.score(tasks.id).
|
||||
t.Run("list view", func(t *testing.T) {
|
||||
assertRelevanceRanked(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1})
|
||||
})
|
||||
}
|
||||
|
|
|
|||
|
|
@ -214,6 +214,10 @@ type taskSearchOptions struct {
|
|||
projectIDs []int64
|
||||
expand []TaskCollectionExpandable
|
||||
projectViewID int64
|
||||
|
||||
// userProvidedSort distinguishes an explicit sort_by from the id/position
|
||||
// defaults appended later, so relevance ordering only replaces the default sort.
|
||||
userProvidedSort bool
|
||||
}
|
||||
|
||||
// ReadAll is a dummy function to still have that endpoint documented
|
||||
|
|
|
|||
Loading…
Reference in New Issue