mirror of
https://github.com/songoda/EpicHeads.git
synced 2024-12-12 12:06:46 +01:00
275 lines
6.8 KiB
Java
275 lines
6.8 KiB
Java
|
package com.songoda.epicheads.handlers;
|
||
|
|
||
|
import com.songoda.epicheads.cache.CacheHead;
|
||
|
import com.songoda.epicheads.util.Checks;
|
||
|
|
||
|
import java.util.ArrayList;
|
||
|
import java.util.Collections;
|
||
|
import java.util.List;
|
||
|
|
||
|
public final class Search {
|
||
|
|
||
|
private Query query;
|
||
|
private Query reusableQuery;
|
||
|
private double threshold;
|
||
|
|
||
|
private int[][] editDis;
|
||
|
private int editDisDim1;
|
||
|
private int editDisDim2;
|
||
|
|
||
|
private List<Substring> substrings = new ArrayList<>();
|
||
|
|
||
|
private Search(String query, double threshold) {
|
||
|
this.query = new Query(query, toWords(query));
|
||
|
this.substrings = new ArrayList<>();
|
||
|
this.reusableQuery = new Query("", null);
|
||
|
this.threshold = threshold;
|
||
|
|
||
|
getReusableArray(query.length() + 1, 38);
|
||
|
}
|
||
|
|
||
|
private int[][] getReusableArray(int dim1, int dim2) {
|
||
|
if (dim1 <= editDisDim1 && dim2 <= editDisDim2)
|
||
|
return editDis;
|
||
|
|
||
|
dim1 = Math.max(dim1, editDisDim1);
|
||
|
dim2 = Math.max(dim2, editDisDim2);
|
||
|
|
||
|
editDis = new int[dim1][dim2];
|
||
|
editDisDim1 = dim1;
|
||
|
editDisDim2 = dim2;
|
||
|
|
||
|
return editDis;
|
||
|
}
|
||
|
|
||
|
private void appendSubstring(int index, String string, int start, int end) {
|
||
|
if (index < substrings.size()) {
|
||
|
substrings.get(index).reuse(string, start, end);
|
||
|
} else {
|
||
|
substrings.add(new Substring(string, start, end));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
public List<Substring> toWords(String string) {
|
||
|
int len = string.length();
|
||
|
|
||
|
int wordCount = 0;
|
||
|
int lastSplit = 0;
|
||
|
boolean inWord = false;
|
||
|
|
||
|
for (int index = 0; index < len; ++index) {
|
||
|
char ch = string.charAt(index);
|
||
|
|
||
|
if (ch == ' ') {
|
||
|
if (inWord) {
|
||
|
appendSubstring(wordCount, string, lastSplit, index);
|
||
|
wordCount += 1;
|
||
|
lastSplit = index + 1;
|
||
|
}
|
||
|
|
||
|
inWord = false;
|
||
|
} else {
|
||
|
inWord = true;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (inWord) {
|
||
|
appendSubstring(wordCount, string, lastSplit, len);
|
||
|
wordCount += 1;
|
||
|
}
|
||
|
|
||
|
return substrings.subList(0, wordCount);
|
||
|
}
|
||
|
|
||
|
public Query reuseQuery(String string) {
|
||
|
return reusableQuery.reuse(string, toWords(string));
|
||
|
}
|
||
|
|
||
|
public List<CacheHead> checkAll(Iterable<CacheHead> heads) {
|
||
|
List<Match> matches = new ArrayList<>();
|
||
|
|
||
|
for (CacheHead head : heads) {
|
||
|
double relevance = calculateRelevance(query, head);
|
||
|
|
||
|
if (relevance <= threshold)
|
||
|
continue;
|
||
|
|
||
|
matches.add(new Match(head, relevance));
|
||
|
}
|
||
|
|
||
|
Collections.sort(matches);
|
||
|
|
||
|
List<CacheHead> results = new ArrayList<>();
|
||
|
|
||
|
for (Match match : matches) {
|
||
|
results.add(match.subject);
|
||
|
}
|
||
|
|
||
|
return results;
|
||
|
}
|
||
|
|
||
|
private double calculateRelevance(Query query, CacheHead head) {
|
||
|
double relevance = calculateRelevance(query, reuseQuery(head.getName()));
|
||
|
|
||
|
for (String tag : head.getTags()) {
|
||
|
relevance = Math.max(relevance, 0.8 * calculateRelevance(query, reuseQuery(tag)));
|
||
|
}
|
||
|
|
||
|
return relevance;
|
||
|
}
|
||
|
|
||
|
private double calculateRelevance(Query query, Query subject) {
|
||
|
double similarity = calcSimilarity(query.string, subject.string);
|
||
|
|
||
|
double wordSimilarity = 0d;
|
||
|
double aggregate = 0d;
|
||
|
int count = 0;
|
||
|
|
||
|
for (Substring queryWord : query.words) {
|
||
|
double querySimilarity = 0d;
|
||
|
for (Substring subjectWord : subject.words) {
|
||
|
querySimilarity = Math.max(querySimilarity, calcSimilarity(queryWord, subjectWord));
|
||
|
}
|
||
|
aggregate += querySimilarity;
|
||
|
count += 1;
|
||
|
wordSimilarity = Math.max(wordSimilarity, querySimilarity);
|
||
|
}
|
||
|
if (count > 0) {
|
||
|
wordSimilarity = 0.9d * wordSimilarity + 0.1d * (aggregate / count);
|
||
|
}
|
||
|
return Math.max(similarity, wordSimilarity);
|
||
|
}
|
||
|
|
||
|
private double calcSimilarity(Substring query, Substring subject) {
|
||
|
int len1 = query.length();
|
||
|
int len2 = subject.length();
|
||
|
int[][] dp = getReusableArray(len1 + 1, len2 + 1);
|
||
|
for (int i = 0; i <= len1; i++) {
|
||
|
dp[i][0] = i;
|
||
|
}
|
||
|
for (int j = 0; j <= len2; j++) {
|
||
|
dp[0][j] = j;
|
||
|
}
|
||
|
for (int i = 0; i < len1; i++) {
|
||
|
char c1 = query.charAt(i);
|
||
|
for (int j = 0; j < len2; j++) {
|
||
|
char c2 = subject.charAt(j);
|
||
|
if (c1 == c2) {
|
||
|
dp[i + 1][j + 1] = dp[i][j];
|
||
|
} else {
|
||
|
int replace = dp[i][j] + 1;
|
||
|
int insert = dp[i][j + 1] + 1;
|
||
|
int delete = dp[i + 1][j] + 1;
|
||
|
int min = replace > insert ? insert : replace;
|
||
|
min = delete > min ? min : delete;
|
||
|
dp[i + 1][j + 1] = min;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
int editDistance = dp[len1][len2];
|
||
|
if (editDistance == 0)
|
||
|
return 1;
|
||
|
return 0.75d * (double) (query.length() - editDistance) / (double) query.length();
|
||
|
}
|
||
|
|
||
|
private final static class Match implements Comparable<Match> {
|
||
|
|
||
|
public final CacheHead subject;
|
||
|
public final double relevance;
|
||
|
|
||
|
private Match(CacheHead subject, double relevance) {
|
||
|
this.subject = subject;
|
||
|
this.relevance = relevance;
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public int compareTo(Match other) {
|
||
|
return Double.compare(other.relevance, relevance);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private final class Query {
|
||
|
|
||
|
public Substring string;
|
||
|
public List<Substring> words;
|
||
|
|
||
|
public Query(String string, List<Substring> words) {
|
||
|
this.string = new Substring(string);
|
||
|
this.words = words;
|
||
|
}
|
||
|
|
||
|
public Query reuse(String string, List<Substring> words) {
|
||
|
this.string.reuse(string);
|
||
|
this.words = words;
|
||
|
return this;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private static class Substring {
|
||
|
|
||
|
public String string;
|
||
|
public int start;
|
||
|
public int end;
|
||
|
|
||
|
public Substring(String string) {
|
||
|
this(string, 0, string.length());
|
||
|
}
|
||
|
|
||
|
public Substring(String string, int start, int end) {
|
||
|
reuse(string, start, end);
|
||
|
}
|
||
|
|
||
|
public Substring reuse(String string) {
|
||
|
return reuse(string, 0, string.length());
|
||
|
}
|
||
|
|
||
|
public Substring reuse(String string, int start, int end) {
|
||
|
Checks.ensureNonNull(string, "string");
|
||
|
this.string = string;
|
||
|
this.moveTo(start, end);
|
||
|
return this;
|
||
|
}
|
||
|
|
||
|
public void moveTo(int start, int end) {
|
||
|
Checks.ensureTrue(start >= 0, "start must be >= 0");
|
||
|
Checks.ensureTrue(end >= start, "end must be >= start");
|
||
|
Checks.ensureTrue(end <= string.length(), "end must be <= to the length of string");
|
||
|
this.start = start;
|
||
|
this.end = end;
|
||
|
}
|
||
|
|
||
|
public char charAt(int index) {
|
||
|
if (index < 0)
|
||
|
throw new IndexOutOfBoundsException("index cannot be negative");
|
||
|
if (index >= length())
|
||
|
throw new IndexOutOfBoundsException("index must be less than the strings length");
|
||
|
char ch = string.charAt(start + index);
|
||
|
return (char) (ch >= 'A' && ch <= 'Z' ? ch + ('a' - 'A') : ch);
|
||
|
}
|
||
|
|
||
|
public int length() {
|
||
|
return end - start;
|
||
|
}
|
||
|
|
||
|
@Override
|
||
|
public String toString() {
|
||
|
return string.substring(start, end);
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Search over the list of heads and find all heads with a relevance above a certain threshold.
|
||
|
* Will simplify the query string in an attempt to improve matches.
|
||
|
*
|
||
|
* @param query The search term.
|
||
|
* @param heads The heads we are checking for matches.
|
||
|
* @param threshold The threshold relevance that a head must have to be matched.
|
||
|
* @return All heads sorted by relevance that have a relevance greater than the threshold.
|
||
|
*/
|
||
|
public static List<CacheHead> searchHeads(String query, Iterable<CacheHead> heads, double threshold) {
|
||
|
return new Search(query, threshold).checkAll(heads);
|
||
|
}
|
||
|
|
||
|
}
|