Harden web search and docs defaults

This commit is contained in:
2026-06-24 23:57:44 -07:00
parent 8fcd94d2c5
commit 8237f1331c
19 changed files with 691 additions and 35 deletions

View File

@@ -1,3 +1,4 @@
*
!Dockerfile
!entrypoint.sh
!constraints.txt

View File

@@ -1,7 +1,10 @@
FROM python:3.12-slim
FROM python:3.12-slim@sha256:6c4dd321d176d61ea848dc8c73a4f7dbae8f70e0ee48bb411ea2f045b599fa8e
ARG LLMS_TXT_MCP_VERSION=0.2.0
ARG MCP_PROXY_VERSION=0.12.0
ARG TORCH_VERSION=2.12.1+cpu
COPY constraints.txt /tmp/context-kit-docs-constraints.txt
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
@@ -11,17 +14,19 @@ RUN apt-get update \
# Install CPU-only torch first so llms-txt-mcp does not pull large CUDA wheels.
RUN pip install --no-cache-dir \
--index-url https://download.pytorch.org/whl/cpu \
torch
-c /tmp/context-kit-docs-constraints.txt \
"torch==${TORCH_VERSION}"
# llms-txt-mcp does the indexing/search; mcp-proxy fronts its stdio transport
# as Streamable HTTP so multiple MCP clients can share one long-lived process
# (and therefore one Chroma DB writer).
RUN if [ -n "${LLMS_TXT_MCP_VERSION}" ]; then \
pip install --no-cache-dir "llms-txt-mcp==${LLMS_TXT_MCP_VERSION}"; \
pip install --no-cache-dir -c /tmp/context-kit-docs-constraints.txt "llms-txt-mcp==${LLMS_TXT_MCP_VERSION}"; \
else \
pip install --no-cache-dir llms-txt-mcp; \
pip install --no-cache-dir -c /tmp/context-kit-docs-constraints.txt llms-txt-mcp; \
fi \
&& pip install --no-cache-dir "mcp-proxy==${MCP_PROXY_VERSION}"
&& pip install --no-cache-dir -c /tmp/context-kit-docs-constraints.txt "mcp-proxy==${MCP_PROXY_VERSION}" \
&& rm /tmp/context-kit-docs-constraints.txt
COPY entrypoint.sh /usr/local/bin/docs-mcp-entrypoint
RUN chmod +x /usr/local/bin/docs-mcp-entrypoint

107
docker/docs/constraints.txt Normal file
View File

@@ -0,0 +1,107 @@
aiohappyeyeballs==2.6.2
aiohttp==3.14.1
aiosignal==1.4.0
annotated-doc==0.0.4
annotated-types==0.7.0
anyio==4.14.1
attrs==26.1.0
bcrypt==5.0.0
build==1.5.0
certifi==2026.6.17
cffi==2.0.0
charset-normalizer==3.4.7
chromadb==1.5.9
click==8.4.2
cryptography==49.0.0
durationpy==0.10
filelock==3.29.0
flatbuffers==25.12.19
frozenlist==1.8.0
fsspec==2026.4.0
googleapis-common-protos==1.75.0
grpcio==1.81.1
h11==0.16.0
hf-xet==1.5.1
httpcore==1.0.9
httptools==0.8.0
httpx==0.28.1
httpx-sse==0.4.3
httpx_auth==0.23.1
huggingface_hub==1.20.1
idna==3.18
importlib_resources==7.1.0
Jinja2==3.1.6
joblib==1.5.3
jsonschema==4.26.0
jsonschema-specifications==2025.9.1
kubernetes==36.0.2
llms-txt-mcp==0.2.0
markdown-it-py==4.2.0
MarkupSafe==3.0.3
mcp==1.28.0
mcp-proxy==0.12.0
mdurl==0.1.2
mmh3==5.2.1
mpmath==1.3.0
multidict==6.7.1
narwhals==2.22.1
networkx==3.6.1
numpy==2.5.0
oauthlib==3.3.1
onnxruntime==1.27.0
opentelemetry-api==1.43.0
opentelemetry-exporter-otlp-proto-common==1.43.0
opentelemetry-exporter-otlp-proto-grpc==1.43.0
opentelemetry-proto==1.43.0
opentelemetry-sdk==1.43.0
opentelemetry-semantic-conventions==0.64b0
orjson==3.11.9
overrides==7.7.0
packaging==26.2
propcache==0.5.2
protobuf==7.35.1
pybase64==1.4.3
pycparser==3.0
pydantic==2.13.4
pydantic-settings==2.14.2
pydantic_core==2.46.4
Pygments==2.20.0
PyJWT==2.13.0
PyPika==0.51.1
pyproject_hooks==1.2.0
python-dateutil==2.9.0.post0
python-dotenv==1.2.2
python-multipart==0.0.32
PyYAML==6.0.3
referencing==0.37.0
regex==2026.5.9
requests==2.34.2
requests-oauthlib==2.0.0
rich==15.0.0
rpds-py==2026.5.1
safetensors==0.8.0
scikit-learn==1.9.0
scipy==1.18.0
sentence-transformers==5.6.0
setuptools==70.2.0
shellingham==1.5.4
six==1.17.0
sse-starlette==3.4.5
starlette==1.3.1
sympy==1.14.0
tenacity==9.1.4
threadpoolctl==3.6.0
tokenizers==0.22.2
torch==2.12.1+cpu
tqdm==4.68.3
transformers==5.12.1
typer==0.25.1
typing-inspection==0.4.2
typing_extensions==4.15.0
urllib3==2.7.0
uvicorn==0.49.0
uvloop==0.22.1
watchfiles==1.2.0
websocket-client==1.9.0
websockets==16.0
yarl==1.24.2

View File

@@ -12,6 +12,8 @@
set -eu
sources_file="${DOCS_MCP_SOURCES_FILE:-/etc/context-kit/docs-sources.txt}"
local_sources_dir="${DOCS_MCP_LOCAL_SOURCES_DIR:-/etc/context-kit/local-sources}"
local_sources_port="${DOCS_MCP_LOCAL_SOURCES_PORT:-8769}"
if [ ! -r "$sources_file" ]; then
echo "docs-mcp: sources file not readable: $sources_file" >&2
@@ -27,11 +29,41 @@ if [ -z "$sources" ]; then
exit 64
fi
# Optional local sources: when the directory exists, serve it over loopback
# HTTP and wait until the server answers (or the probe gives up).
if [ -d "$local_sources_dir" ]; then
# Start the static file server in the background, bound to 127.0.0.1 only;
# its stdout and stderr are redirected to a log file under /tmp.
python -m http.server "$local_sources_port" \
--bind 127.0.0.1 \
--directory "$local_sources_dir" \
>/tmp/context-kit-local-sources.log 2>&1 &
local_sources_pid="$!"
# Readiness probe: the inline Python script receives the port as argv[1],
# makes up to 20 requests to http://127.0.0.1:<port>/ with a 0.5 s timeout,
# sleeps 0.1 s between attempts, and exits 0 on the first response whose
# status is below 500. Otherwise it exits non-zero with the last error.
if ! python - "$local_sources_port" <<'PY'
import sys
import time
import urllib.request
port = sys.argv[1]
last_error = None
for _ in range(20):
try:
with urllib.request.urlopen(f"http://127.0.0.1:{port}/", timeout=0.5) as response:
if response.status < 500:
raise SystemExit(0)
except Exception as error:
last_error = error
time.sleep(0.1)
raise SystemExit(f"local source server did not become ready: {last_error}")
PY
then
# Probe failed: stop the background server (kill errors are ignored because
# the process may already have exited), report on stderr, and exit 65.
kill "$local_sources_pid" 2>/dev/null || true
echo "docs-mcp: local source server failed on 127.0.0.1:$local_sources_port" >&2
exit 65
fi
fi
# By default llms-txt-mcp 0.2.0 re-embeds every source on launch (the actual
# default is a background preindex, --no-preindex only disables the foreground
# variant). On a long-lived container that just wastes ~5 min of CPU per
# restart, so we disable BOTH and let the caller use `docs_refresh` on demand.
# Set DOCS_MCP_PREINDEX=1 to restore the eager behavior.
# variant). On a long-lived container that wastes CPU per restart, so we disable
# BOTH. Missing/stale sources still refresh on first docs_query/docs_refresh.
# Set DOCS_MCP_PREINDEX=1 to restore eager startup indexing.
preindex_flag="--no-preindex --no-background-preindex"
if [ "${DOCS_MCP_PREINDEX:-0}" = "1" ]; then
preindex_flag=""

View File

@@ -1,2 +1,5 @@
*
!Dockerfile
!patch-mcp-web-search.mjs
!overrides/
!overrides/bing.js

View File

@@ -1,7 +1,14 @@
FROM node:22-bookworm-slim
FROM node:22-bookworm-slim@sha256:813a7480f28fdadac1f7f5c824bcdad435b5bc1322a5968bbbdef8d058f9dff4
ARG MCP_WEB_SEARCH_VERSION=1.3.0
ARG MCP_WEB_SEARCH_MAX_BYTES=52428800
COPY patch-mcp-web-search.mjs /tmp/patch-mcp-web-search.mjs
COPY overrides/bing.js /tmp/context-kit-bing-provider.js
# Chromium intentionally tracks Debian security updates inside the pinned base
# image family; Bing's browser path is more likely to break with stale Chromium
# than with patched OS packages.
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
ca-certificates \
@@ -10,11 +17,15 @@ RUN apt-get update \
&& rm -rf /var/lib/apt/lists/*
RUN npm install -g "@zhafron/mcp-web-search@${MCP_WEB_SEARCH_VERSION}" \
&& cp /tmp/context-kit-bing-provider.js /usr/local/lib/node_modules/@zhafron/mcp-web-search/dist/src/providers/bing.js \
&& node /tmp/patch-mcp-web-search.mjs \
&& rm /tmp/patch-mcp-web-search.mjs /tmp/context-kit-bing-provider.js \
&& npm cache clean --force
ENV CHROME_PATH=/usr/bin/chromium \
DEFAULT_SEARCH_PROVIDER=searxng \
HTTP_TIMEOUT=15000 \
MAX_BYTES=${MCP_WEB_SEARCH_MAX_BYTES} \
MAX_RESULTS=10 \
SEARXNG_URL=http://searxng:8080

View File

@@ -0,0 +1,114 @@
import { PUPPETEER_TIMEOUT } from "../constants.js";
import { browserPool } from "../utils/browser-pool.js";
import { getAcceptLanguageHeader, getMarketFromLang } from "../utils/user-agent.js";
import { searchCache, createCacheKey } from "../utils/cache.js";
// Context Kit override for @zhafron/mcp-web-search 1.3.0.
// The upstream provider can read Bing before result cards render and return an
// empty fallback. Keep this as a direct provider replacement until upstream
// waits for cards and decodes current /ck/a redirects reliably.
const DEFAULT_BROWSER_SEARCH_USER_AGENT = process.env.BROWSER_SEARCH_USER_AGENT ||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36";
/**
 * Decode a base64url-encoded string into UTF-8 text.
 *
 * The URL-safe characters `-` and `_` are mapped back to `+` and `/`, and the
 * input is right-padded with `=` to a multiple of four characters before it is
 * handed to the standard base64 decoder.
 *
 * @param {string} value - base64url text, with or without trailing padding.
 * @returns {string} The decoded bytes interpreted as UTF-8.
 */
function decodeBase64Url(value) {
    // Single pass over the two URL-safe characters instead of two replaces.
    const standardAlphabet = value.replace(/[-_]/g, (ch) => (ch === "-" ? "+" : "/"));
    const missingPadding = (4 - (standardAlphabet.length % 4)) % 4;
    const bytes = Buffer.from(standardAlphabet + "=".repeat(missingPadding), "base64");
    return bytes.toString("utf-8");
}
/**
 * Bing search provider that reads the rendered results page through
 * `browserPool` and memoizes non-empty result sets in `searchCache`.
 */
export class BingProvider {
    name = "bing";

    /**
     * Resolve a Bing result href to the URL it points at.
     *
     * Hrefs are resolved against https://www.bing.com/. For `/ck/a` redirect
     * links on www.bing.com the `u` query parameter is tried as base64url,
     * first as-is and then (when it starts with a letter followed by a digit)
     * with that two-character prefix removed; the first candidate that decodes
     * to an http(s) URL wins.
     *
     * @param {string} href - absolute or Bing-relative link taken from a card.
     * @returns {string} The decoded target, the resolved href when nothing
     *   decodes, or the raw `href` when it cannot be parsed as a URL at all.
     */
    decodeBingRedirect(href) {
        try {
            const url = new URL(href, "https://www.bing.com/");
            if (url.hostname === "www.bing.com" && url.pathname === "/ck/a") {
                const encoded = url.searchParams.get("u");
                if (encoded) {
                    const candidates = [encoded];
                    if (/^[a-z][0-9]/i.test(encoded)) candidates.push(encoded.slice(2));
                    for (const candidate of candidates) {
                        try {
                            const decoded = decodeBase64Url(candidate);
                            if (/^https?:\/\//i.test(decoded)) return decoded;
                        }
                        catch {
                            // Best effort: an undecodable candidate just means
                            // "try the next one", then fall back to the href.
                        }
                    }
                }
            }
            return url.toString();
        }
        catch {
            return href;
        }
    }

    /**
     * Run a Bing web search and return the parsed result cards.
     *
     * @param {string} q - search query, sent as the `q` parameter.
     * @param {number} limit - maximum number of cards to return.
     * @param {string} lang - language hint passed to `getMarketFromLang` and
     *   `getAcceptLanguageHeader`.
     * @returns {Promise<Array<{title: string, url: string, snippet: (string|undefined), source: string}>>}
     *   Parsed results with redirect links decoded; cards whose link is not a
     *   valid URL are dropped.
     * @throws {Error} `Bing HTTP <status>` when the results page responds with
     *   a status of 400 or above. Errors raised by navigation are not caught
     *   here.
     */
    async search(q, limit, lang) {
        const cacheKey = createCacheKey("bing", q, limit, lang);
        const cached = searchCache.get(cacheKey);
        if (cached)
            return cached;
        const market = getMarketFromLang(lang);
        const results = await browserPool.withBrowser(async (browser) => {
            const page = await browser.newPage();
            try {
                await page.setViewport({ width: 1365, height: 768 });
                await page.setUserAgent(DEFAULT_BROWSER_SEARCH_USER_AGENT);
                await page.setExtraHTTPHeaders(getAcceptLanguageHeader(lang));
                const url = new URL("https://www.bing.com/search");
                url.searchParams.set("q", q);
                url.searchParams.set("mkt", market);
                const response = await page.goto(url.toString(), {
                    waitUntil: "domcontentloaded",
                    timeout: PUPPETEER_TIMEOUT
                });
                if (response && response.status() >= 400) {
                    throw new Error(`Bing HTTP ${response.status()}`);
                }
                // Cards render after DOMContentLoaded. A timeout here is not
                // fatal: the scrape below then simply finds no cards.
                await page.waitForSelector("li.b_algo h2 a[href], li.b_algo a[href]", { timeout: 10000 }).catch(() => undefined);
                const items = await page.evaluate(maxResults => {
                    const parsed = [];
                    for (const card of Array.from(document.querySelectorAll("li.b_algo"))) {
                        const anchor = card.querySelector("h2 a[href]") || card.querySelector("a[href]");
                        const title = anchor?.textContent?.trim() || "";
                        const href = anchor?.getAttribute("href") || "";
                        if (!title || !href)
                            continue;
                        const snippetElement = card.querySelector("div.b_caption p, div.b_snippet, p");
                        const snippet = snippetElement?.textContent?.trim() || undefined;
                        parsed.push({ title, url: href, snippet });
                        if (parsed.length >= maxResults)
                            break;
                    }
                    return parsed;
                }, limit);
                return items.flatMap(result => {
                    try {
                        const absolute = new URL(result.url, "https://www.bing.com/").toString();
                        const decoded = this.decodeBingRedirect(absolute);
                        // Validation only: throws (and drops the card) when the
                        // decoded link is not an absolute URL.
                        new URL(decoded);
                        return [{ ...result, url: decoded, source: "bing" }];
                    }
                    catch {
                        return [];
                    }
                });
            }
            finally {
                await page.close();
            }
        });
        // Do not memoize an empty result set. It is what a selector timeout or
        // an unrendered page produces, and because an empty array is truthy a
        // cached one would be served on every retry of the same query.
        if (results.length > 0) {
            searchCache.set(cacheKey, results);
        }
        return results;
    }

    /**
     * Report whether a browser can be obtained from `browserPool`.
     *
     * @returns {Promise<boolean>} `true` when `browserPool.getBrowser()`
     *   resolves, `false` when it rejects.
     */
    async isAvailable() {
        try {
            await browserPool.getBrowser();
            return true;
        }
        catch {
            return false;
        }
    }
}

View File

@@ -0,0 +1,28 @@
import fs from "node:fs";
// Context Kit patch for @zhafron/mcp-web-search 1.3.0.
// Upstream hard-codes the fetch_url schema limit to 25 MiB even though the
// runtime extractor already uses MAX_BYTES. Keep this narrow and fail the build
// if upstream changes the compiled source shape.
const serverPath = "/usr/local/lib/node_modules/@zhafron/mcp-web-search/dist/src/server.js";

// Ordered [needle, substitute] pairs. The first brings MAX_BYTES into scope in
// server.js; the second swaps the literal schema ceiling for that constant.
const replacements = [
    [
        'import { MAX_RESULTS } from "./constants.js";',
        'import { MAX_BYTES, MAX_RESULTS } from "./constants.js";'
    ],
    [
        "max_download_bytes: z.number().int().min(1).max(26214400).optional()",
        "max_download_bytes: z.number().int().min(1).max(MAX_BYTES).optional()"
    ]
];

// Fold the replacements over the file text. A missing needle aborts with an
// error before anything is written, so the file on disk is left untouched.
const patched = replacements.reduce((text, [before, after]) => {
    if (!text.includes(before)) {
        throw new Error(`mcp-web-search patch target not found: ${before}`);
    }
    return text.replace(before, after);
}, fs.readFileSync(serverPath, "utf8"));

fs.writeFileSync(serverPath, patched);

View File

@@ -15,8 +15,8 @@ search:
- json
server:
# Local placeholder. The Docker service also sets SEARXNG_SECRET from .env;
# keep SearXNG bound to 127.0.0.1 unless you review this config separately.
# Local placeholder. Keep SearXNG bound to 127.0.0.1 unless you review this
# config and replace the secret_key for a deliberate non-local deployment.
secret_key: "local-only-change-if-exposed"
limiter: false
image_proxy: true