refactor(search): limit BM25 relevance ranking to pure-text searches
Rank ParadeDB search results by BM25 relevance only for pure-text searches over a plain project scope. Numeric searches (the `OR index = N` branch) and the Favorites view (the `id IN (<subquery>)` scope) keep the default ordering (unranked, as on main): pdb.score rejects both as unsupported query shapes, and the contortions previously needed to score them (two-arm numeric merge with in-memory pagination, a favorites LEFT JOIN) added far more complexity than the ranking was worth. Neither path was ranked before this PR, so leaving them at the default order is no regression.
This commit is contained in:
parent
78dde2fb18
commit
cefa42da86
|
|
@ -142,7 +142,7 @@ func getTaskFilterOptsFromCollection(tf *TaskCollection, projectView *ProjectVie
|
||||||
|
|
||||||
opts = &taskSearchOptions{
|
opts = &taskSearchOptions{
|
||||||
sortby: sort,
|
sortby: sort,
|
||||||
userProvidedSort: len(sort) > 0,
|
userProvidedSort: len(tf.SortBy) > 0,
|
||||||
filterIncludeNulls: tf.FilterIncludeNulls,
|
filterIncludeNulls: tf.FilterIncludeNulls,
|
||||||
filter: tf.Filter,
|
filter: tf.Filter,
|
||||||
filterTimezone: tf.FilterTimezone,
|
filterTimezone: tf.FilterTimezone,
|
||||||
|
|
|
||||||
|
|
@ -320,17 +320,11 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
|
||||||
|
|
||||||
// Then return all tasks for that projects
|
// Then return all tasks for that projects
|
||||||
var where builder.Cond
|
var where builder.Cond
|
||||||
// textSearchCond holds only the ParadeDB/ILIKE title+description match, kept
|
|
||||||
// separate from the index-equality match so the relevance ranking path can
|
|
||||||
// score a pure-ParadeDB query (see rankByRelevance below).
|
|
||||||
var textSearchCond builder.Cond
|
|
||||||
var searchIndex int64
|
|
||||||
|
|
||||||
|
searchIndex := getTaskIndexFromSearchString(opts.search)
|
||||||
if opts.search != "" {
|
if opts.search != "" {
|
||||||
textSearchCond = db.MultiFieldSearchWithTableAlias([]string{"title", "description"}, opts.search, "tasks")
|
where = db.MultiFieldSearchWithTableAlias([]string{"title", "description"}, opts.search, "tasks")
|
||||||
where = textSearchCond
|
|
||||||
|
|
||||||
searchIndex = getTaskIndexFromSearchString(opts.search)
|
|
||||||
if searchIndex > 0 {
|
if searchIndex > 0 {
|
||||||
where = builder.Or(where, builder.Eq{"tasks.`index`": searchIndex})
|
where = builder.Or(where, builder.Eq{"tasks.`index`": searchIndex})
|
||||||
}
|
}
|
||||||
|
|
@ -356,27 +350,8 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
|
||||||
favoritesCond = builder.In("tasks.id", favCond)
|
favoritesCond = builder.In("tasks.id", favCond)
|
||||||
}
|
}
|
||||||
|
|
||||||
scopeCond := builder.Or(projectIDCond, favoritesCond)
|
|
||||||
|
|
||||||
limit, start := getLimitFromPageIndex(opts.page, opts.perPage)
|
limit, start := getLimitFromPageIndex(opts.page, opts.perPage)
|
||||||
cond := builder.And(scopeCond, where, filterCond)
|
cond := builder.And(builder.Or(projectIDCond, favoritesCond), where, filterCond)
|
||||||
|
|
||||||
// ParadeDB exposes the BM25 relevance score via pdb.score(<key_field>) for any
|
|
||||||
// query containing a ParadeDB operator (the ||| from MultiFieldSearch qualifies).
|
|
||||||
// When searching without an explicit user sort, order by relevance so tasks
|
|
||||||
// matching all query words rank above tasks matching only some. This is
|
|
||||||
// ParadeDB-only: pdb.score is invalid SQL on sqlite/mysql/plain postgres.
|
|
||||||
rankByRelevance := db.ParadeDBAvailable() && opts.search != "" && !opts.userProvidedSort
|
|
||||||
|
|
||||||
// ParadeDB's pdb.score() rejects an `id IN (<subquery>)` favorites scope (whether
|
|
||||||
// expressed as OR or UNION) as an unsupported query shape, so the relevance arms
|
|
||||||
// reach favorites through a LEFT JOIN and scope on the joined column instead,
|
|
||||||
// which it can score. Only relevant when favorites are part of the scope.
|
|
||||||
rankFavoritesJoin := rankByRelevance && d.hasFavoritesProject
|
|
||||||
rankScopeCond := scopeCond
|
|
||||||
if rankFavoritesJoin {
|
|
||||||
rankScopeCond = builder.Or(projectIDCond, builder.Expr("rank_favorites.entity_id IS NOT NULL"))
|
|
||||||
}
|
|
||||||
|
|
||||||
var distinct = "tasks.*"
|
var distinct = "tasks.*"
|
||||||
if strings.Contains(orderby, "task_positions.") {
|
if strings.Contains(orderby, "task_positions.") {
|
||||||
|
|
@ -391,124 +366,73 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// addJoins applies the same LEFT JOINs the count query and every fetch arm
|
|
||||||
// rely on (position sort, bucket filter, subtask expansion).
|
|
||||||
addJoins := func(query *xorm.Session) *xorm.Session {
|
|
||||||
for _, param := range opts.sortby {
|
|
||||||
if param.sortBy == taskPropertyPosition {
|
|
||||||
query = query.Join("LEFT", "task_positions", "task_positions.task_id = tasks.id AND task_positions.project_view_id = ?", param.projectViewID)
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if joinTaskBuckets {
|
|
||||||
joinCond := "task_buckets.task_id = tasks.id"
|
|
||||||
if opts.projectViewID > 0 {
|
|
||||||
joinCond += " AND task_buckets.project_view_id = ?"
|
|
||||||
query = query.Join("LEFT", "task_buckets", joinCond, opts.projectViewID)
|
|
||||||
} else {
|
|
||||||
query = query.Join("LEFT", "task_buckets", joinCond)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if expandSubtasks {
|
|
||||||
query = query.
|
|
||||||
Join("LEFT", "task_relations", "tasks.id = task_relations.task_id and task_relations.relation_kind = 'parenttask'").
|
|
||||||
Join("LEFT", "tasks parent_tasks", "task_relations.other_task_id = parent_tasks.id")
|
|
||||||
}
|
|
||||||
return query
|
|
||||||
}
|
|
||||||
|
|
||||||
subtaskParentCond := builder.Or(
|
|
||||||
builder.IsNull{"task_relations.id"},
|
|
||||||
builder.IsNull{"parent_tasks.id"},
|
|
||||||
builder.Expr("parent_tasks.project_id != tasks.project_id"),
|
|
||||||
)
|
|
||||||
if expandSubtasks {
|
if expandSubtasks {
|
||||||
cond = builder.And(cond, subtaskParentCond)
|
cond = builder.And(cond, builder.Or(
|
||||||
|
builder.IsNull{"task_relations.id"},
|
||||||
|
builder.IsNull{"parent_tasks.id"},
|
||||||
|
builder.Expr("parent_tasks.project_id != tasks.project_id"),
|
||||||
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
// fetchTasks runs a single fetch arm: it builds the DISTINCT select (raw, so
|
// ParadeDB exposes the BM25 relevance score via pdb.score(tasks.id) for a query
|
||||||
// xorm doesn't quote-corrupt the pdb.score function call), applies the joins
|
// containing a ParadeDB operator (the ||| from MultiFieldSearch qualifies). When
|
||||||
// and the given order. paginate=false fetches every matching row so the caller
|
// searching without an explicit user sort, order by relevance so tasks matching
|
||||||
// can merge multiple arms and slice the combined result in Go.
|
// all query words rank above tasks matching only some.
|
||||||
fetchTasks := func(armCond builder.Cond, selectCols, armOrderby string, paginate bool) ([]*Task, error) {
|
//
|
||||||
query := d.s.Where(armCond)
|
// This is limited to pure-text searches over a plain project scope: numeric
|
||||||
if selectCols == distinct {
|
// searches add an `OR index = N` branch and the Favorites view scopes on an
|
||||||
query = query.Distinct(selectCols)
|
// `id IN (<subquery>)`, both of which pdb.score rejects as unsupported query
|
||||||
|
// shapes. Those keep the default ordering (unranked). pdb.score is also invalid
|
||||||
|
// SQL on sqlite/mysql/plain postgres, hence the ParadeDBAvailable() gate.
|
||||||
|
rankByRelevance := db.ParadeDBAvailable() &&
|
||||||
|
opts.search != "" &&
|
||||||
|
!opts.userProvidedSort &&
|
||||||
|
searchIndex == 0 &&
|
||||||
|
!d.hasFavoritesProject
|
||||||
|
|
||||||
|
query := d.s.Where(cond)
|
||||||
|
if rankByRelevance {
|
||||||
|
// Select() passes the raw column list through untouched while Distinct()
|
||||||
|
// (no args) still emits DISTINCT. Distinct("tasks.*, pdb.score(tasks.id)")
|
||||||
|
// would quote-corrupt the function call into "pdb"."score(tasks"."id)".
|
||||||
|
query = query.Select(distinct + ", pdb.score(tasks.id)").Distinct()
|
||||||
|
orderby = "pdb.score(tasks.id) DESC, " + orderby
|
||||||
|
} else {
|
||||||
|
query = query.Distinct(distinct)
|
||||||
|
}
|
||||||
|
if limit > 0 {
|
||||||
|
query = query.Limit(limit, start)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, param := range opts.sortby {
|
||||||
|
if param.sortBy == taskPropertyPosition {
|
||||||
|
query = query.Join("LEFT", "task_positions", "task_positions.task_id = tasks.id AND task_positions.project_view_id = ?", param.projectViewID)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if joinTaskBuckets {
|
||||||
|
joinCond := "task_buckets.task_id = tasks.id"
|
||||||
|
if opts.projectViewID > 0 {
|
||||||
|
joinCond += " AND task_buckets.project_view_id = ?"
|
||||||
|
query = query.Join("LEFT", "task_buckets", joinCond, opts.projectViewID)
|
||||||
} else {
|
} else {
|
||||||
// Select() passes the raw column list through untouched while Distinct()
|
query = query.Join("LEFT", "task_buckets", joinCond)
|
||||||
// (no args) still emits the DISTINCT keyword.
|
|
||||||
query = query.Select(selectCols).Distinct()
|
|
||||||
}
|
}
|
||||||
if paginate && limit > 0 {
|
}
|
||||||
query = query.Limit(limit, start)
|
if expandSubtasks {
|
||||||
}
|
query = query.
|
||||||
if rankFavoritesJoin {
|
Join("LEFT", "task_relations", "tasks.id = task_relations.task_id and task_relations.relation_kind = 'parenttask'").
|
||||||
query = query.Join("LEFT", "favorites rank_favorites", "rank_favorites.entity_id = tasks.id AND rank_favorites.user_id = ? AND rank_favorites.kind = ?", d.a.GetID(), FavoriteKindTask)
|
Join("LEFT", "tasks parent_tasks", "task_relations.other_task_id = parent_tasks.id")
|
||||||
}
|
|
||||||
query = addJoins(query)
|
|
||||||
|
|
||||||
armTasks := []*Task{}
|
|
||||||
if err := query.OrderBy(armOrderby).Find(&armTasks); err != nil {
|
|
||||||
sql, vals := query.LastSQL()
|
|
||||||
return nil, fmt.Errorf("could not fetch tasks, error was '%w', sql: '%v', values: %v", err, sql, vals)
|
|
||||||
}
|
|
||||||
return armTasks, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
rankCondWith := func(searchCond builder.Cond) builder.Cond {
|
tasks = []*Task{}
|
||||||
c := builder.And(rankScopeCond, searchCond, filterCond)
|
err = query.
|
||||||
if expandSubtasks {
|
OrderBy(orderby).
|
||||||
c = builder.And(c, subtaskParentCond)
|
Find(&tasks)
|
||||||
}
|
if err != nil {
|
||||||
return c
|
sql, vals := query.LastSQL()
|
||||||
}
|
return nil, 0, fmt.Errorf("could not fetch tasks, error was '%w', sql: '%v', values: %v", err, sql, vals)
|
||||||
|
|
||||||
switch {
|
|
||||||
case rankByRelevance && searchIndex > 0:
|
|
||||||
// A numeric search matches both the task index and the fuzzy text. pdb.score
|
|
||||||
// can only score a pure-ParadeDB query, so a `||| ... OR index = N` group is
|
|
||||||
// an unsupported query shape on ParadeDB. Run two supported arms instead and
|
|
||||||
// rank exact index matches first, then text matches by relevance.
|
|
||||||
indexTasks, err := fetchTasks(rankCondWith(builder.Eq{"tasks.`index`": searchIndex}), distinct, orderby, false)
|
|
||||||
if err != nil {
|
|
||||||
return nil, 0, err
|
|
||||||
}
|
|
||||||
|
|
||||||
textTasks, err := fetchTasks(rankCondWith(textSearchCond), distinct+", pdb.score(tasks.id)", "pdb.score(tasks.id) DESC, "+orderby, false)
|
|
||||||
if err != nil {
|
|
||||||
return nil, 0, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Exact index matches rank first; dedup a task matching both arms in favour
|
|
||||||
// of its index-match position.
|
|
||||||
seen := make(map[int64]bool, len(indexTasks)+len(textTasks))
|
|
||||||
merged := make([]*Task, 0, len(indexTasks)+len(textTasks))
|
|
||||||
for _, t := range indexTasks {
|
|
||||||
if !seen[t.ID] {
|
|
||||||
seen[t.ID] = true
|
|
||||||
merged = append(merged, t)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for _, t := range textTasks {
|
|
||||||
if !seen[t.ID] {
|
|
||||||
seen[t.ID] = true
|
|
||||||
merged = append(merged, t)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
tasks = paginateInMemory(merged, limit, start)
|
|
||||||
case rankByRelevance:
|
|
||||||
// Pure text search: a single pdb.score-ordered query over the score-able
|
|
||||||
// scope is a supported shape.
|
|
||||||
tasks, err = fetchTasks(rankCondWith(textSearchCond), distinct+", pdb.score(tasks.id)", "pdb.score(tasks.id) DESC, "+orderby, true)
|
|
||||||
if err != nil {
|
|
||||||
return nil, 0, err
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
tasks, err = fetchTasks(cond, distinct, orderby, true)
|
|
||||||
if err != nil {
|
|
||||||
return nil, 0, err
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// fetch subtasks when expanding
|
// fetch subtasks when expanding
|
||||||
|
|
@ -584,16 +508,3 @@ func (d *dbTaskSearcher) Search(opts *taskSearchOptions) (tasks []*Task, totalCo
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// paginateInMemory slices an already-ordered result set. limit == 0 means "no
|
|
||||||
// limit" (return everything from start onwards), matching getLimitFromPageIndex.
|
|
||||||
func paginateInMemory(tasks []*Task, limit, start int) []*Task {
|
|
||||||
if start >= len(tasks) {
|
|
||||||
return []*Task{}
|
|
||||||
}
|
|
||||||
tasks = tasks[start:]
|
|
||||||
if limit > 0 && limit < len(tasks) {
|
|
||||||
tasks = tasks[:limit]
|
|
||||||
}
|
|
||||||
return tasks
|
|
||||||
}
|
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,6 @@
|
||||||
package models
|
package models
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"strconv"
|
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"code.vikunja.io/api/pkg/db"
|
"code.vikunja.io/api/pkg/db"
|
||||||
|
|
@ -106,62 +105,3 @@ func TestTaskSearchRelevanceRanking(t *testing.T) {
|
||||||
assertRelevanceRanked(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1})
|
assertRelevanceRanked(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1})
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// TestTaskSearchRelevanceRankingNumericIndex covers a numeric search (e.g. "#42"):
|
|
||||||
// it matches both a task by its per-project index and tasks whose title/description
|
|
||||||
// contain that number via fuzzy text search. On ParadeDB the exact-index task must
|
|
||||||
// rank first, then the text matches by relevance. This is the case that combines an
|
|
||||||
// `index = N` equality with the ParadeDB ||| operators; scoring such a mixed boolean
|
|
||||||
// group is an unsupported query shape, so it is run as two arms (index, then text).
|
|
||||||
func TestTaskSearchRelevanceRankingNumericIndex(t *testing.T) {
|
|
||||||
db.LoadAndAssertFixtures(t)
|
|
||||||
s := db.NewSession()
|
|
||||||
defer s.Close()
|
|
||||||
|
|
||||||
usr := &user.User{ID: 1}
|
|
||||||
|
|
||||||
// The exact-index task: its index is what we search for. Its title deliberately
|
|
||||||
// does not contain the number, so it can only be found by the index match.
|
|
||||||
exactIndex := &Task{Title: "Quarterly planning offsite", ProjectID: 1}
|
|
||||||
require.NoError(t, exactIndex.Create(s, usr))
|
|
||||||
require.NotZero(t, exactIndex.Index)
|
|
||||||
|
|
||||||
indexStr := strconv.FormatInt(exactIndex.Index, 10)
|
|
||||||
search := "#" + indexStr
|
|
||||||
|
|
||||||
// Text matches: their titles contain the searched number so the fuzzy text arm
|
|
||||||
// returns them, but they are not the exact-index task.
|
|
||||||
textA := &Task{Title: "Review ticket " + search + " backlog", ProjectID: 1}
|
|
||||||
require.NoError(t, textA.Create(s, usr))
|
|
||||||
textB := &Task{Title: "Notes about " + search, ProjectID: 1}
|
|
||||||
require.NoError(t, textB.Create(s, usr))
|
|
||||||
|
|
||||||
assertIndexFirst := func(t *testing.T, tc *TaskCollection) {
|
|
||||||
got, _, _, err := tc.ReadAll(s, usr, search, 0, 50)
|
|
||||||
require.NoError(t, err)
|
|
||||||
|
|
||||||
gotTasks, is := got.([]*Task)
|
|
||||||
require.True(t, is)
|
|
||||||
|
|
||||||
gotIDs := make([]int64, len(gotTasks))
|
|
||||||
for i, tsk := range gotTasks {
|
|
||||||
gotIDs[i] = tsk.ID
|
|
||||||
}
|
|
||||||
|
|
||||||
require.Contains(t, gotIDs, exactIndex.ID, "the exact-index task should be returned")
|
|
||||||
|
|
||||||
if db.ParadeDBAvailable() {
|
|
||||||
require.NotEmpty(t, gotTasks)
|
|
||||||
assert.Equal(t, exactIndex.ID, gotTasks[0].ID, "the exact-index match should rank first")
|
|
||||||
assert.Contains(t, gotIDs, textA.ID, "text matches should also be returned, ranked after the index match")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
t.Run("no view", func(t *testing.T) {
|
|
||||||
assertIndexFirst(t, &TaskCollection{ProjectID: 1})
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("list view", func(t *testing.T) {
|
|
||||||
assertIndexFirst(t, &TaskCollection{ProjectID: 1, ProjectViewID: 1})
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue