This commit is contained in:
Kendell R 2024-05-07 14:45:02 +02:00 committed by GitHub
commit 2d06a6b535
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 3394 additions and 3 deletions

View File

@ -0,0 +1,36 @@
import os
from os.path import dirname, join
from json import dump
from sphinx.application import Sphinx
from sphinx.util.fileutil import copy_asset_file
from sphinx.jinja2glue import SphinxFileSystemLoader
from .embed import tokenize, embed
# True only when the NETLIFY environment variable is exactly the string "true".
# NOTE(review): presumably set by Netlify's build environment - confirm.
is_production = os.environ.get("NETLIFY") == "true"
def builder_inited(app):
    """Put this extension's directory first in the builder's template loaders.

    Connected to the ``builder-inited`` event by ``setup``. Inserting at
    index 0 means templates found next to this module are consulted before
    any loader that was already registered.
    """
    extension_dir = dirname(__file__)
    extension_loader = SphinxFileSystemLoader(extension_dir)
    app.builder.templates.loaders.insert(0, extension_loader)
def build_finished(app: Sphinx, exception):
    """Write the title-embedding index and copy the search assets to the output.

    Connected to the ``build-finished`` event by ``setup``. Every page title
    in ``app.env.titles`` is tokenized and embedded; pages whose title yields
    no embedding are left out of the index.

    Args:
        app: The Sphinx application; ``app.env.titles`` and
            ``app.builder.outdir`` are read.
        exception: Supplied by the event; not inspected here.
    """
    embedding_collection = []
    for page, title_node in app.env.titles.items():
        title = title_node.astext()
        embedding = embed(tokenize(title))
        # embed() returns None when no token of the title has a known vector.
        if embedding:
            embedding_collection.append(
                {"page": f"/{page}.html", "title": title, "embedding": embedding}
            )

    # Use a context manager so the index is flushed and closed deterministically
    # instead of relying on garbage collection of an anonymous file object.
    index_path = join(app.builder.outdir, "embedding-index.json")
    with open(index_path, "w") as index_file:
        dump(embedding_collection, index_file)

    # The browser-side search script needs the same vector table as the build.
    output_path = join(app.builder.outdir, "_static")
    extension_dir = dirname(__file__)
    copy_asset_file(join(extension_dir, "glove-50d-reduced.txt"), output_path)
    copy_asset_file(join(extension_dir, "searchbox.js"), output_path)
def setup(app):
    """Register the extension's event handlers and return its metadata.

    The returned dictionary declares the extension version and whether it is
    safe for parallel reading and writing; parallel writing is reported as
    safe only when ``is_production`` is false.
    """
    handlers = (
        ("builder-inited", builder_inited),
        ("build-finished", build_finished),
    )
    for event_name, handler in handlers:
        app.connect(event_name, handler)

    metadata = {
        "version": "1.0.0",
        "parallel_read_safe": True,
        "parallel_write_safe": not is_production,
    }
    return metadata

View File

@ -0,0 +1,39 @@
from os.path import join, dirname
import re
# Location of the vector table that ships next to this module.
embedding_path = join(dirname(__file__), "glove-50d-reduced.txt")

# Maps a word to ``(idf, values)``: its IDF weight and its vector components.
embeddings = {}

# Each non-blank line has the form "<word> <idf> <v1> <v2> ...", space separated.
# The file is opened with an explicit encoding so parsing does not depend on
# the platform default, and inside a context manager so the handle is closed.
with open(embedding_path, "r", encoding="utf-8") as embedding_file:
    for line in embedding_file:
        # A blank line (e.g. a trailing newline) carries no entry; unpacking
        # it would raise ValueError.
        if not line.strip():
            continue
        w, idf, *values = line.split(" ")
        embeddings[w] = (float(idf), [float(x) for x in values])
# Ordered (needle, replacement) pairs applied to the lower-cased text.
# Order matters: a needle that contains another needle must come first
# ("rgbww" before "rgbw", "faqs" before "faq"), otherwise the shorter rule
# rewrites the text and the longer rule can never match. The
# "neopixelbus" -> "neopixel bus" -> "neo pixel bus" cascade relies on the
# second rule seeing the first rule's output.
_PHRASE_REPLACEMENTS = (
    ("b-parasite", "b parasite"),
    ("nfc/rfid", "nfc rfid"),
    ("fastled", "fast led"),
    ("neopixelbus", "neopixel bus"),
    ("neopixel", "neo pixel"),
    ("h-bridge", "h bridge"),
    ("eco_", "co"),
    ("co_", "co"),
    ("rgbww", "rgb cold warm"),
    ("rgbw", "rgb white"),
    ("rgbct", "rgb temperature brightness"),
    ("faqs", "frequently asked questions"),
    ("faq", "frequently asked questions"),
    ("cannot", "can not"),
    ("addressable", "addressed"),
    ("automations", "automation"),
)


def tokenize(string):
    """Split *string* into lower-case tokens suitable for ``embed``.

    Punctuation followed by a space is separated into its own token, numbers
    become the token ``<number>``, a set of domain terms and abbreviations is
    expanded into plain words, and runs of whitespace are collapsed.

    Args:
        string: The text to tokenize.

    Returns:
        A list of token strings. An empty or all-whitespace input yields
        ``[""]`` because the result comes from ``str.split(" ")``.
    """
    string = string.lower()
    string = re.sub(r"\n", " ", string)
    string = re.sub(r"\, ", " , ", string)
    string = re.sub(r"\. ", " . ", string)
    string = re.sub(r"['] ", " ' ", string)
    string = re.sub(r"[\"“”] ", " '' ", string)
    string = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*|²", " <number> ", string)
    for needle, replacement in _PHRASE_REPLACEMENTS:
        string = string.replace(needle, replacement)
    # Whole-word abbreviations only, hence the word-boundary anchors.
    string = re.sub(r"\bha\b", "home assistant", string)
    string = re.sub(r"\badc\b", "analog digital converter", string)
    string = re.sub(r"\s+", " ", string)
    return string.strip().split(" ")
def embed(tokens):
    """Return the IDF-weighted average vector of the known tokens.

    Tokens that are not keys of ``embeddings`` are ignored. Each remaining
    token contributes its vector scaled by its IDF weight; the sum is divided
    by the total IDF weight.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list of 50 floats rounded to 4 decimal places, or ``None`` when the
        accumulated weight is zero (for example, no token is known).
    """
    known_entries = [embeddings[token] for token in tokens if token in embeddings]

    output = [0] * 50
    total = 0
    for idf, values in known_entries:
        for position, component in enumerate(values):
            output[position] += component * idf
        total += idf

    if total == 0:
        return None
    return [round(weighted_sum / total, 4) for weighted_sum in output]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
{%- if pagename != "search" and builder != "singlehtml" %}
{#- Rendered hidden; searchbox.js sets display to "block" when it runs, so the
    box only appears when JavaScript is available. #}
<div id="searchbox" style="display: none" role="search">
  <form class="search" action="{{ pathto('search') }}" method="get">
    {#- aria-label gives the input an accessible name directly; this template
        contains no element with id "searchlabel" for aria-labelledby to
        reference. #}
    <input
      type="text"
      name="q"
      aria-label="{{ _('Search') }}"
      autocomplete="off"
      autocorrect="off"
      autocapitalize="off"
      spellcheck="false"
      placeholder="Search"
    />
    <input type="submit" value="{{ _('Go') }}" />
  </form>
  <div class="output"></div>
</div>
<script type="module" src="{{ pathto('_static/searchbox.js', 1) }}"></script>
{%- endif %}

View File

@ -0,0 +1,112 @@
// Reveal the search box: the template renders it with an inline
// `display: none`, so it only becomes visible once this script runs.
document.getElementById("searchbox").style.display = "block";
/**
 * Split a query into lower-case tokens suitable for `embed`.
 *
 * Punctuation and brackets are separated into their own tokens, numbers
 * become the token `<number>`, a set of domain terms and abbreviations is
 * expanded into plain words, and runs of whitespace are collapsed.
 *
 * @param {string} value Text typed by the user.
 * @returns {string[]} Tokens; an empty input yields `[""]`.
 */
const tokenize = (value) => {
  value = value
    .toLowerCase()
    .replace(/\n/g, " ")
    .replace(/[?!;@#$%&]/g, " $& ")
    .replace(/[\]\[\(\)\{\}<>]/g, " $& ")
    .replace(/('s|'m|'d|'ll|'re|'ve|n't) /gi, " $1 ")
    .replace(/\, /g, " , ")
    .replace(/\. /g, " . ")
    .replace(/['] /g, " ' ")
    .replace(/["“”]/g, " '' ");
  value = value.replace(/[-+]?[.\d]*[\d]+[:,.\d]*|²/g, " <number> ");
  // Order matters below: a needle that contains another needle must come
  // first ("rgbww" before "rgbw", "faqs" before "faq"); otherwise the shorter
  // rule rewrites the text and the longer rule can never match.
  value = value
    .replaceAll("b-parasite", "b parasite")
    .replaceAll("nfc/rfid", "nfc rfid")
    .replaceAll("fastled", "fast led")
    .replaceAll("neopixelbus", "neopixel bus")
    .replaceAll("neopixel", "neo pixel")
    .replaceAll("h-bridge", "h bridge")
    .replaceAll("eco_", "co")
    .replaceAll("co_", "co")
    .replaceAll("rgbww", "rgb cold warm")
    .replaceAll("rgbw", "rgb white")
    .replaceAll("rgbct", "rgb temperature brightness")
    .replaceAll("faqs", "frequently asked questions")
    .replaceAll("faq", "frequently asked questions")
    .replaceAll("cannot", "can not")
    .replaceAll("addressable", "addressed")
    .replaceAll("automations", "automation")
    // Whole-word abbreviations only, hence the word-boundary anchors.
    .replace(/\bha\b/g, "home assistant")
    .replace(/\badc\b/g, "analog digital converter");
  return value.replace(/\s+/g, " ").trim().split(" ");
};
/**
 * Return the IDF-weighted average vector of the tokens found in `glove`.
 *
 * @param {string[]} tokens Tokens produced by `tokenize`.
 * @returns {number[] | null} A 50-component vector, or `null` when the
 *   accumulated weight is zero (for example, no token is known).
 */
const embed = (tokens) => {
  let output = Array.from({ length: 50 }, () => 0);
  let total = 0;
  for (let token of tokens) {
    // `glove` is a plain object, so a bare `glove[token]` also resolves
    // inherited names such as "constructor"; destructuring that value gives
    // `values === undefined` and the loop below would throw. Only accept
    // words that were actually loaded.
    if (!Object.prototype.hasOwnProperty.call(glove, token) || !glove[token]) {
      continue;
    }
    const { idf, values } = glove[token];
    for (let i = 0; i < values.length; i++) {
      output[i] += values[i] * idf;
    }
    total += idf;
  }
  if (total == 0) return null;
  return output.map((x) => x / total);
};
/**
 * Cosine similarity of two equal-length numeric vectors.
 *
 * @param {number[]} a First vector.
 * @param {number[]} b Second vector; indexed by the positions of `a`.
 * @returns {number} dot(a, b) / sqrt(|a|^2 * |b|^2).
 */
const cosine = (a, b) => {
  const add = (running, term) => running + term;
  const squaredNorm = (vector) => vector.map((v) => v * v).reduce(add);
  const normProduct = squaredNorm(a) * squaredNorm(b);
  const dotProduct = a.map((v, index) => v * b[index]).reduce(add);
  return dotProduct / Math.sqrt(normProduct);
};
// word -> { idf, values }, filled asynchronously from the vector table.
let glove = {};
// Page entries ({ page, title, embedding }) from the build-time index,
// filled asynchronously. Until both loads finish, searches return nothing.
let embeddings = [];
(async () => {
  const r = await fetch("/_static/glove-50d-reduced.txt");
  // Without this check an error page would be parsed as vector lines and
  // fill `glove` with NaN entries.
  if (!r.ok) {
    throw new Error(`Failed to load glove-50d-reduced.txt: HTTP ${r.status}`);
  }
  const data = await r.text();
  for (const x of data.split("\n")) {
    // A blank line (e.g. after a trailing newline) would otherwise be stored
    // as glove[""] with a NaN idf, making empty queries embed to NaN.
    if (x.trim() === "") continue;
    const [w, idf, ...values] = x.split(" ");
    glove[w] = {
      idf: parseFloat(idf),
      values: values.map((x) => parseFloat(x)),
    };
  }
})();
(async () => {
  const r = await fetch("/embedding-index.json");
  if (!r.ok) {
    throw new Error(`Failed to load embedding-index.json: HTTP ${r.status}`);
  }
  embeddings = await r.json();
})();
// The query field and the container that receives the result links.
const input = document.querySelector("#searchbox input");
const output = document.querySelector("#searchbox .output");
// Stack the result links vertically.
Object.assign(output.style, { display: "flex", flexDirection: "column" });
/**
 * Build one styled result link.
 *
 * @param {string} href Target of the link.
 * @param {string} title Visible text of the link.
 * @returns {HTMLAnchorElement} The detached anchor element.
 */
const item = (href, title) => {
  const link = document.createElement("a");
  link.href = href;
  link.innerText = title;
  Object.assign(link.style, {
    background: "#101010",
    color: "#ececec",
    padding: "0.5rem",
    borderRadius: "1rem",
    marginTop: "0.5rem",
  });
  return link;
};
// On every keystroke: embed the query, rank all indexed pages by cosine
// similarity, and show links to the three best matches. A query with no
// embedding clears the results.
input.addEventListener("input", () => {
  const queryEmbedding = embed(tokenize(input.value));

  let ranked = [];
  if (queryEmbedding) {
    ranked = embeddings
      .map((entry) => ({
        ...entry,
        similarity: cosine(entry.embedding, queryEmbedding),
      }))
      .sort((left, right) => right.similarity - left.similarity);
  }

  const links = ranked.slice(0, 3).map((entry) => item(entry.page, entry.title));
  output.replaceChildren(...links);
});

View File

@ -129,7 +129,7 @@ class SEODirective(Directive):
if not image.startswith("/"):
local_img = f"/images/{image}"
image = "/_images/" + image
p = Path(__file__).parent / local_img[1:]
p = Path(__file__).parent.parent / local_img[1:]
if not p.is_file():
raise ValueError(f"File {p} for seo tag does not exist {self.state.document}")

View File

@ -25,7 +25,7 @@ import os
import sys
sys.path.append(os.path.abspath("."))
sys.path.append(os.path.abspath("_extensions"))
# -- General configuration ------------------------------------------------
@ -41,6 +41,7 @@ extensions = [
"seo",
"components",
"sitemap",
"embedding-search",
]
# Add any paths that contain templates here, relative to this directory.

View File

@ -320,7 +320,7 @@ def lint_newline(fname):
return "File contains Windows newline. Please set your editor to Unix newline mode."
@lint_content_check(exclude=["*.svg", "runtime.txt", "_static/*"])
@lint_content_check(exclude=["*.svg", "runtime.txt", "_extensions/embedding-search/glove-50d-reduced.txt", "_static/*"])
def lint_end_newline(fname, content):
if content and not content.endswith("\n"):
return "File does not end with a newline, please add an empty line at the end of the file."