Merge 39c5ebd699
into 32a3645a05
This commit is contained in:
commit
2d06a6b535
|
@ -0,0 +1,36 @@
|
|||
import os
|
||||
from os.path import dirname, join
|
||||
from json import dump
|
||||
from sphinx.application import Sphinx
|
||||
from sphinx.util.fileutil import copy_asset_file
|
||||
from sphinx.jinja2glue import SphinxFileSystemLoader
|
||||
|
||||
from .embed import tokenize, embed
|
||||
|
||||
# True only when the NETLIFY environment variable is exactly "true"
# (presumably set by the Netlify build environment -- confirm against the deploy setup).
is_production = os.environ.get("NETLIFY") == "true"
|
||||
|
||||
def builder_inited(app):
    """Put this module's directory first in the builder's template loaders.

    Args:
        app: Object whose ``builder.templates.loaders`` list is extended.
    """
    extension_dir = dirname(__file__)
    extension_loader = SphinxFileSystemLoader(extension_dir)
    # Position 0 so this loader is consulted before the loaders already registered.
    app.builder.templates.loaders.insert(0, extension_loader)
|
||||
|
||||
def build_finished(app: Sphinx, exception):
    """Write the title-embedding index and copy the search assets to the output.

    For every entry in ``app.env.titles`` the title text is tokenized and
    embedded; pages whose title yields no embedding are left out of the index.
    The index is written to ``embedding-index.json`` in the output directory,
    and the word-vector file plus ``searchbox.js`` are copied into ``_static``.

    Args:
        app: The Sphinx application; its ``env.titles`` and ``builder.outdir``
            are read.
        exception: Passed in by the event; not inspected by this handler.
    """
    embedding_collection = []
    for page, title_node in app.env.titles.items():
        title = title_node.astext()
        embedding = embed(tokenize(title))
        # embed() returns None when no token carries any weight; skip those pages.
        if embedding:
            embedding_collection.append(
                {"page": f"/{page}.html", "title": title, "embedding": embedding}
            )

    # Use a context manager so the index is flushed and closed deterministically
    # instead of relying on garbage collection of an anonymous file object.
    index_path = join(app.builder.outdir, "embedding-index.json")
    with open(index_path, "w", encoding="utf-8") as index_file:
        dump(embedding_collection, index_file)

    extension_dir = dirname(__file__)
    output_path = join(app.builder.outdir, "_static")
    copy_asset_file(join(extension_dir, "glove-50d-reduced.txt"), output_path)
    copy_asset_file(join(extension_dir, "searchbox.js"), output_path)
|
||||
|
||||
def setup(app):
    """Connect the extension's event handlers and return its metadata.

    Args:
        app: Object providing ``connect(event_name, callback)``.

    Returns:
        A dict with the extension version and its parallel read/write flags.
    """
    handlers = (
        ("builder-inited", builder_inited),
        ("build-finished", build_finished),
    )
    for event_name, handler in handlers:
        app.connect(event_name, handler)

    metadata = {
        "version": "1.0.0",
        "parallel_read_safe": True,
        # Parallel writing is declared safe only outside production builds.
        "parallel_write_safe": not is_production,
    }
    return metadata
|
|
@ -0,0 +1,39 @@
|
|||
from os.path import join, dirname
|
||||
import re
|
||||
|
||||
# Word-vector file that sits next to this module.
embedding_path = join(dirname(__file__), "glove-50d-reduced.txt")

# Maps each word to a tuple of (idf weight, list of vector components).
# Each line of the file is "<word> <idf> <v1> <v2> ..." separated by single spaces.
embeddings = {}
with open(embedding_path, "r", encoding="utf-8") as embedding_file:
    for line in embedding_file:
        line = line.rstrip("\r\n")
        # A blank line (e.g. a trailing newline at end of file) would otherwise
        # break the starred unpacking below.
        if not line.strip():
            continue
        w, idf, *values = line.split(" ")
        embeddings[w] = (float(idf), [float(x) for x in values])
|
||||
|
||||
def tokenize(string):
    """Normalise *string* and split it into a list of lower-case tokens.

    Steps, in order: lower-case; turn newlines into spaces; pad commas, full
    stops, apostrophes and double quotes that are followed by a space; replace
    numerals (and ``²``) with ``<number>``; expand a fixed list of
    domain-specific spellings; expand the standalone words ``ha`` and ``adc``;
    collapse whitespace and split on single spaces.

    Args:
        string: Text to tokenize.

    Returns:
        List of token strings. An empty or all-whitespace input yields ``[""]``.
    """
    string = string.lower()
    string = re.sub(r"\n", " ", string)
    string = re.sub(r"\, ", " , ", string)
    string = re.sub(r"\. ", " . ", string)
    string = re.sub(r"['’] ", " ' ", string)
    string = re.sub(r"[\"“”] ", " '' ", string)
    string = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*|²", " <number> ", string)

    # Order matters: a longer spelling must be rewritten before any shorter
    # spelling it contains. In particular "rgbww" has to precede "rgbw",
    # otherwise it is turned into "rgb whitew" and never expanded.
    replacements = (
        ("b-parasite", "b parasite"),
        ("nfc/rfid", "nfc rfid"),
        ("fastled", "fast led"),
        ("neopixelbus", "neopixel bus"),
        ("neopixel", "neo pixel"),
        ("h-bridge", "h bridge"),
        ("eco_", "co"),
        ("co_", "co"),
        ("rgbww", "rgb cold warm"),
        ("rgbw", "rgb white"),
        ("rgbct", "rgb temperature brightness"),
        ("faqs", "frequently asked questions"),
        ("faq", "frequently asked questions"),
        ("cannot", "can not"),
        ("addressable", "addressed"),
        ("automations", "automation"),
    )
    for old, new in replacements:
        string = string.replace(old, new)

    string = re.sub(r"\bha\b", "home assistant", string)
    string = re.sub(r"\badc\b", "analog digital converter", string)
    string = re.sub(r"\s+", " ", string)
    return string.strip().split(" ")
|
||||
|
||||
def embed(tokens):
    """Return the idf-weighted mean vector of the known tokens.

    Tokens missing from the module-level ``embeddings`` table are ignored.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list of 50 floats rounded to four decimals, or ``None`` when the
        accumulated idf weight is zero (for example, no token was known).
    """
    weighted_sum = [0] * 50
    weight_total = 0
    for token in tokens:
        if token in embeddings:
            idf, vector = embeddings[token]
            for position, component in enumerate(vector):
                weighted_sum[position] += component * idf
            weight_total += idf

    if weight_total == 0:
        return None
    return [round(component / weight_total, 4) for component in weighted_sum]
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,19 @@
|
|||
{#- Search box: rendered hidden and revealed by searchbox.js; omitted on the
    search page and in singlehtml builds. The text input carries its own
    aria-label because this template defines no element with id "searchlabel". -#}
{%- if pagename != "search" and builder != "singlehtml" %}
<div id="searchbox" style="display: none" role="search">
  <form class="search" action="{{ pathto('search') }}" method="get">
    <input
      type="text"
      name="q"
      aria-label="{{ _('Search') }}"
      autocomplete="off"
      autocorrect="off"
      autocapitalize="off"
      spellcheck="false"
      placeholder="{{ _('Search') }}"
    />
    <input type="submit" value="{{ _('Go') }}" />
  </form>
  <div class="output"></div>
</div>
<script type="module" src="{{ pathto('_static/searchbox.js', 1) }}"></script>
{%- endif %}
|
|
@ -0,0 +1,112 @@
|
|||
// Make the #searchbox element visible now that this script is running.
const searchContainer = document.getElementById("searchbox");
searchContainer.style.display = "block";
|
||||
|
||||
/**
 * Normalise a query string and split it into lower-case tokens.
 *
 * Steps, in order: lower-case; newlines to spaces; pad selected punctuation,
 * brackets, contraction suffixes and quotes with spaces; replace numerals
 * (and "²") with "<number>"; expand a fixed list of domain-specific
 * spellings; expand the standalone words "ha" and "adc"; collapse whitespace
 * and split on single spaces.
 *
 * @param {string} value - Text to tokenize.
 * @returns {string[]} Tokens; an empty input yields [""].
 */
const tokenize = (value) => {
  value = value
    .toLowerCase()
    .replace(/\n/g, " ")
    .replace(/[?!;@#$%&]/g, " $& ")
    .replace(/[\]\[\(\)\{\}<>]/g, " $& ")
    .replace(/('s|'m|'d|'ll|'re|'ve|n't) /gi, " $1 ")
    .replace(/\, /g, " , ")
    .replace(/\. /g, " . ")
    .replace(/['’] /g, " ' ")
    .replace(/["“”]/g, " '' ");
  value = value.replace(/[-+]?[.\d]*[\d]+[:,.\d]*|²/g, " <number> ");

  // Order matters: a longer spelling must be rewritten before any shorter
  // spelling it contains. "rgbww" has to precede "rgbw", otherwise it becomes
  // "rgb whitew" and is never expanded.
  const replacements = [
    ["b-parasite", "b parasite"],
    ["nfc/rfid", "nfc rfid"],
    ["fastled", "fast led"],
    ["neopixelbus", "neopixel bus"],
    ["neopixel", "neo pixel"],
    ["h-bridge", "h bridge"],
    ["eco_", "co"],
    ["co_", "co"],
    ["rgbww", "rgb cold warm"],
    ["rgbw", "rgb white"],
    ["rgbct", "rgb temperature brightness"],
    ["faqs", "frequently asked questions"],
    ["faq", "frequently asked questions"],
    ["cannot", "can not"],
    ["addressable", "addressed"],
    ["automations", "automation"],
  ];
  for (const [from, to] of replacements) {
    value = value.replaceAll(from, to);
  }

  value = value
    .replace(/\bha\b/g, "home assistant")
    .replace(/\badc\b/g, "analog digital converter");
  return value.replace(/\s+/g, " ").trim().split(" ");
};
|
||||
/**
 * Compute the idf-weighted mean vector of the tokens found in `glove`.
 *
 * @param {Iterable<string>} tokens - Tokens to embed.
 * @returns {number[]|null} A 50-element vector, or null when no token
 *   contributed any weight.
 */
const embed = (tokens) => {
  const output = Array.from({ length: 50 }, () => 0);
  let total = 0;
  for (const token of tokens) {
    // Own-property check: a plain-object lookup would also resolve inherited
    // keys such as "constructor" or "__proto__" and then fail on values.length.
    if (!Object.prototype.hasOwnProperty.call(glove, token)) {
      continue;
    }
    const { idf, values } = glove[token];
    // Skip malformed entries (e.g. parsed from a blank line) whose NaN weight
    // would turn every component of the result into NaN.
    if (!Number.isFinite(idf) || !Array.isArray(values)) {
      continue;
    }

    for (let i = 0; i < values.length; i++) {
      output[i] += values[i] * idf;
    }
    total += idf;
  }

  if (total == 0) return null;
  return output.map((x) => x / total);
};
|
||||
/**
 * Cosine similarity of two numeric vectors.
 *
 * @param {number[]} a - First vector.
 * @param {number[]} b - Second vector; indexed in step with `a`.
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either vector has zero
 *   magnitude (instead of NaN, which would corrupt a sort by similarity).
 */
const cosine = (a, b) => {
  let dot = 0;
  let a_norm = 0;
  let b_norm = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    a_norm += a[i] * a[i];
  }
  for (const x of b) {
    b_norm += x * x;
  }

  const magnitude = Math.sqrt(a_norm * b_norm);
  // Guard the division: 0 / 0 would yield NaN.
  if (magnitude === 0) return 0;
  return dot / magnitude;
};
|
||||
|
||||
// Word -> { idf, values } table, filled asynchronously from the word-vector
// file. Created without a prototype so lookups of words such as "constructor"
// cannot hit inherited properties.
let glove = Object.create(null);
// Page index entries ({ page, title, embedding }), filled asynchronously.
let embeddings = [];

(async () => {
  const r = await fetch("/_static/glove-50d-reduced.txt");
  if (!r.ok) {
    throw new Error(`Failed to load word vectors: HTTP ${r.status}`);
  }
  const data = await r.text();
  for (const line of data.split("\n")) {
    // Blank lines (e.g. after a trailing newline) would otherwise register an
    // entry for the empty word with a NaN weight.
    if (line.trim() === "") continue;
    const [w, idf, ...values] = line.split(" ");
    glove[w] = {
      idf: parseFloat(idf),
      values: values.map((x) => parseFloat(x)),
    };
  }
})().catch((error) => console.error(error));

(async () => {
  const r = await fetch("/embedding-index.json");
  if (!r.ok) {
    throw new Error(`Failed to load embedding index: HTTP ${r.status}`);
  }
  embeddings = await r.json();
})().catch((error) => console.error(error));
|
||||
|
||||
// Text field that receives the query, and the container the result links go into.
const input = document.querySelector("#searchbox input");
const output = document.querySelector("#searchbox .output");

// Stack the result links vertically.
Object.assign(output.style, {
  display: "flex",
  flexDirection: "column",
});
|
||||
|
||||
/**
 * Build one styled result link.
 *
 * @param {string} href - Link target.
 * @param {string} title - Text shown for the link.
 * @returns {HTMLAnchorElement} The new, unattached anchor element.
 */
const item = (href, title) => {
  const link = document.createElement("a");
  link.href = href;
  link.innerText = title;
  Object.assign(link.style, {
    background: "#101010",
    color: "#ececec",
    padding: "0.5rem",
    borderRadius: "1rem",
    marginTop: "0.5rem",
  });
  return link;
};
|
||||
// On every edit of the query, show the three index entries whose embedding is
// most similar to the query's embedding; show nothing when the query cannot
// be embedded.
input.addEventListener("input", () => {
  const embedding = embed(tokenize(input.value));

  let results = [];
  if (embedding) {
    const scored = embeddings.map((entry) => ({
      ...entry,
      similarity: cosine(entry.embedding, embedding),
    }));
    // Highest similarity first.
    results = scored.sort((a, b) => b.similarity - a.similarity);
  }

  const links = results.slice(0, 3).map((entry) => item(entry.page, entry.title));
  output.replaceChildren(...links);
});
|
|
@ -129,7 +129,7 @@ class SEODirective(Directive):
|
|||
if not image.startswith("/"):
|
||||
local_img = f"/images/{image}"
|
||||
image = "/_images/" + image
|
||||
p = Path(__file__).parent / local_img[1:]
|
||||
p = Path(__file__).parent.parent / local_img[1:]
|
||||
if not p.is_file():
|
||||
raise ValueError(f"File {p} for seo tag does not exist {self.state.document}")
|
||||
|
3
conf.py
3
conf.py
|
@ -25,7 +25,7 @@ import os
|
|||
import sys
|
||||
|
||||
|
||||
sys.path.append(os.path.abspath("."))
|
||||
sys.path.append(os.path.abspath("_extensions"))
|
||||
|
||||
# -- General configuration ------------------------------------------------
|
||||
|
||||
|
@ -41,6 +41,7 @@ extensions = [
|
|||
"seo",
|
||||
"components",
|
||||
"sitemap",
|
||||
"embedding-search",
|
||||
]
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
|
|
2
lint.py
2
lint.py
|
@ -320,7 +320,7 @@ def lint_newline(fname):
|
|||
return "File contains Windows newline. Please set your editor to Unix newline mode."
|
||||
|
||||
|
||||
@lint_content_check(exclude=["*.svg", "runtime.txt", "_static/*"])
|
||||
@lint_content_check(exclude=["*.svg", "runtime.txt", "_extensions/embedding-search/glove-50d-reduced.txt", "_static/*"])
|
||||
def lint_end_newline(fname, content):
|
||||
if content and not content.endswith("\n"):
|
||||
return "File does not end with a newline, please add an empty line at the end of the file."
|
||||
|
|
Loading…
Reference in New Issue