mirror of https://github.com/djcb/mu.git
Avoid word-splitting regular expression matches
Previously, we would conduct regular expression searches by enumerating all values of a given term, manually regex-matching each one against our search regular expression, remembering all the term values that matched our regular expression, and then doing a big Xapian OR-query that matched any of those term values. In constructing this OR-query, however, we would split each term value on space and add a separate Xapian phrase search term for each resulting word. This approach worked fine most of the time, because when we index a term, we index both each word in a term and the whole term by itself. This word splitting produced false negatives in some matches, however, because Xapian and the Mu-level word splitting code do word splitting slightly differently and apply different transformations to the text while splitting. (For example, Xapian transforms fancy Unicode apostrophes to ASCII apostrophes.) This patch avoids the problem by not word splitting when constructing the big Xapian OR-query for finding the results of regular expression matching.
This commit is contained in:
parent
50489fe6bb
commit
26b3110b8f
|
@ -267,7 +267,7 @@ Parser::Private::regex(const FieldInfoVec& fields,
|
||||||
for (const auto& field : fields) {
|
for (const auto& field : fields) {
|
||||||
const auto terms = process_regex(field.field, rx);
|
const auto terms = process_regex(field.field, rx);
|
||||||
for (const auto& term : terms) {
|
for (const auto& term : terms) {
|
||||||
tree.add_child(Tree({Node::Type::Value,
|
tree.add_child(Tree({Node::Type::ValueAtomic,
|
||||||
FieldValue{field.id, term}}));
|
FieldValue{field.id, term}}));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -80,6 +80,7 @@ struct Node {
|
||||||
OpAndNot,
|
OpAndNot,
|
||||||
OpNot,
|
OpNot,
|
||||||
Value,
|
Value,
|
||||||
|
ValueAtomic,
|
||||||
Range,
|
Range,
|
||||||
Invalid
|
Invalid
|
||||||
};
|
};
|
||||||
|
@ -107,6 +108,8 @@ struct Node {
|
||||||
return "not";
|
return "not";
|
||||||
case Type::Value:
|
case Type::Value:
|
||||||
return "value";
|
return "value";
|
||||||
|
case Type::ValueAtomic:
|
||||||
|
return "value_atomic";
|
||||||
case Type::Range:
|
case Type::Range:
|
||||||
return "range";
|
return "range";
|
||||||
case Type::Invalid:
|
case Type::Invalid:
|
||||||
|
|
|
@ -83,11 +83,15 @@ xapian_query_value(const Mu::Tree& tree)
|
||||||
return make_query(field_val, true /*maybe-wildcard*/);
|
return make_query(field_val, true /*maybe-wildcard*/);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const bool is_atomic = tree.node.type == Node::Type::ValueAtomic;
|
||||||
|
|
||||||
const auto parts{split(field_val.value(), " ")};
|
const auto parts{split(field_val.value(), " ")};
|
||||||
if (parts.empty())
|
if (parts.empty())
|
||||||
return Xapian::Query::MatchNothing; // shouldn't happen
|
return Xapian::Query::MatchNothing; // shouldn't happen
|
||||||
else if (parts.size() == 1)
|
else if (parts.size() == 1 && !is_atomic)
|
||||||
return make_query(field_val, true /*maybe-wildcard*/);
|
return make_query(field_val, true /*maybe-wildcard*/);
|
||||||
|
else if (is_atomic)
|
||||||
|
return make_query(field_val, false /*maybe-wildcard*/);
|
||||||
|
|
||||||
std::vector<Xapian::Query> phvec;
|
std::vector<Xapian::Query> phvec;
|
||||||
for (const auto& p : parts) {
|
for (const auto& p : parts) {
|
||||||
|
@ -124,6 +128,7 @@ Mu::xapian_query(const Mu::Tree& tree)
|
||||||
case Node::Type::OpAndNot:
|
case Node::Type::OpAndNot:
|
||||||
return xapian_query_op(tree);
|
return xapian_query_op(tree);
|
||||||
case Node::Type::Value:
|
case Node::Type::Value:
|
||||||
|
case Node::Type::ValueAtomic:
|
||||||
return xapian_query_value(tree);
|
return xapian_query_value(tree);
|
||||||
case Node::Type::Range:
|
case Node::Type::Range:
|
||||||
return xapian_query_range(tree);
|
return xapian_query_range(tree);
|
||||||
|
|
Loading…
Reference in New Issue