Harden web search and docs defaults

This commit is contained in:
2026-06-24 23:57:44 -07:00
parent 8fcd94d2c5
commit 8237f1331c
19 changed files with 691 additions and 35 deletions

View File

@@ -10,9 +10,19 @@ CONTEXT_KIT_COMPOSE_PROJECT=context-kit
# Local SearXNG port. Bound to 127.0.0.1 only. # Local SearXNG port. Bound to 127.0.0.1 only.
CONTEXT_KIT_SEARXNG_PORT=8099 CONTEXT_KIT_SEARXNG_PORT=8099
# Local-only SearXNG secret. Set this to any random string if you expose SearXNG # Max bytes accepted and downloaded by context-web-search fetch_url.
# beyond localhost, which the default setup does not do. # Keep this aligned with agent tool-call defaults to avoid schema rejections.
CONTEXT_KIT_SEARXNG_SECRET=change-me-local-only CONTEXT_KIT_WEB_SEARCH_MAX_BYTES=52428800
# Web-search defaults. Search uses SearXNG first, then falls back to
# DuckDuckGo and Bing. Bing requires Chromium inside the web-search image.
CONTEXT_KIT_WEB_SEARCH_PROVIDER=searxng
CONTEXT_KIT_WEB_SEARCH_HTTP_TIMEOUT=15000
CONTEXT_KIT_WEB_SEARCH_MAX_RESULTS=10
CONTEXT_KIT_WEB_SEARCH_CHROME_PATH=/usr/bin/chromium
# User agent used by the Chromium-backed Bing search fallback.
# CONTEXT_KIT_WEB_SEARCH_BROWSER_USER_AGENT="Mozilla/5.0 ..."
# Set to "legacy" only for MCP clients with weak tool-schema parsers.
# CONTEXT_KIT_WEB_SEARCH_MCP_COMPAT_MODE=legacy
# Long-lived context-docs HTTP MCP service. Bound to 127.0.0.1 only. # Long-lived context-docs HTTP MCP service. Bound to 127.0.0.1 only.
CONTEXT_KIT_DOCS_PORT=8776 CONTEXT_KIT_DOCS_PORT=8776
@@ -33,3 +43,8 @@ CONTEXT_KIT_DOCS_EMBED_MODEL=BAAI/bge-small-en-v1.5
# One or more source files, separated by spaces. # One or more source files, separated by spaces.
CONTEXT_KIT_DOCS_SOURCES=config/sources.default.txt CONTEXT_KIT_DOCS_SOURCES=config/sources.default.txt
# Optional machine-local llms.txt tree. Files are served only inside docs-mcp at
# http://127.0.0.1:8769/ so absolute local paths do not leak into source files.
# CONTEXT_KIT_DOCS_LOCAL_SOURCES_DIR=/path/to/context-kit-local-sources
# CONTEXT_KIT_DOCS_LOCAL_SOURCES_PORT=8769

View File

@@ -10,7 +10,7 @@ Context Kit gives coding agents three local MCP servers:
| Server | Purpose | Default | | Server | Purpose | Default |
|---|---|---| |---|---|---|
| `context-web-search` | Current web search and URL fetch through local SearXNG | Enabled | | `context-web-search` | Current web search through local SearXNG plus URL fetch/extract | Enabled |
| `context-docs` | Semantic search over curated `llms.txt` documentation | Enabled | | `context-docs` | Semantic search over curated `llms.txt` documentation | Enabled |
| `context-repomix` | Pack local or remote repositories into AI-friendly context | Enabled | | `context-repomix` | Pack local or remote repositories into AI-friendly context | Enabled |
@@ -56,6 +56,10 @@ config that will not be committed.
## Defaults ## Defaults
- SearXNG binds to `127.0.0.1:8099` only. - SearXNG binds to `127.0.0.1:8099` only.
- `context-web-search` defaults `search_web` to SearXNG, then falls back to
DuckDuckGo and Bing. Bing uses Chromium inside the web-search image.
- `fetch_url` uses upstream HTTP extraction. In `mcp-web-search` 1.3.0,
`engine=browser` is accepted but does not invoke Chromium yet.
- `context-docs` runs as a long-lived service on `127.0.0.1:8776` (Streamable - `context-docs` runs as a long-lived service on `127.0.0.1:8776` (Streamable
HTTP MCP) so every client shares one indexer and one Chroma writer. The HTTP MCP) so every client shares one indexer and one Chroma writer. The
`bin/context-kit docs` stdio command is kept as a compatibility shim for `bin/context-kit docs` stdio command is kept as a compatibility shim for
@@ -74,7 +78,6 @@ The default docs index is intentionally small:
- Claude Code docs - Claude Code docs
- OpenAI API docs and reference - OpenAI API docs and reference
- Anthropic docs
- OpenRouter docs - OpenRouter docs
- Model Context Protocol docs - Model Context Protocol docs
@@ -91,8 +94,8 @@ CONTEXT_KIT_DOCS_SOURCES="config/sources.default.txt config/sources.js.txt" \
bin/context-kit docs bin/context-kit docs
``` ```
Cloudflare is opt-in because it can expand to thousands of sections and take a Large vendor feeds are opt-in because they can expand to thousands of sections
while to embed. and take a while to embed.
## Commands ## Commands

View File

@@ -1,7 +1,17 @@
#!/usr/bin/env bash #!/usr/bin/env bash
set -euo pipefail set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" SCRIPT_PATH="${BASH_SOURCE[0]}"
# Follow SCRIPT_PATH through any chain of symlinks so that ROOT is computed
# from the real script location instead of the directory holding a link to it.
while [[ -L "${SCRIPT_PATH}" ]]; do
  SCRIPT_DIR="$(cd -P "$(dirname "${SCRIPT_PATH}")" && pwd)"
  SCRIPT_TARGET="$(readlink "${SCRIPT_PATH}")"
  case "${SCRIPT_TARGET}" in
    # Absolute link target: use it as-is.
    /*) SCRIPT_PATH="${SCRIPT_TARGET}" ;;
    # Relative link target: resolve it against the directory holding the link.
    *) SCRIPT_PATH="${SCRIPT_DIR}/${SCRIPT_TARGET}" ;;
  esac
done
ROOT="$(cd -P "$(dirname "${SCRIPT_PATH}")/.." && pwd)"
ENV_FILE="${ROOT}/.env" ENV_FILE="${ROOT}/.env"
load_env_file() { load_env_file() {
@@ -39,10 +49,19 @@ NETWORK="${CONTEXT_KIT_DOCKER_NETWORK:-${PROJECT}_default}"
SEARXNG_PORT="${CONTEXT_KIT_SEARXNG_PORT:-8099}" SEARXNG_PORT="${CONTEXT_KIT_SEARXNG_PORT:-8099}"
DOCS_PORT="${CONTEXT_KIT_DOCS_PORT:-8776}" DOCS_PORT="${CONTEXT_KIT_DOCS_PORT:-8776}"
DOCS_HTTP_URL="${CONTEXT_KIT_DOCS_HTTP_URL:-http://127.0.0.1:${DOCS_PORT}/mcp}" DOCS_HTTP_URL="${CONTEXT_KIT_DOCS_HTTP_URL:-http://127.0.0.1:${DOCS_PORT}/mcp}"
WEB_SEARCH_MAX_BYTES="${CONTEXT_KIT_WEB_SEARCH_MAX_BYTES:-52428800}"
WEB_SEARCH_PROVIDER="${CONTEXT_KIT_WEB_SEARCH_PROVIDER:-${DEFAULT_SEARCH_PROVIDER:-searxng}}"
WEB_SEARCH_HTTP_TIMEOUT="${CONTEXT_KIT_WEB_SEARCH_HTTP_TIMEOUT:-${HTTP_TIMEOUT:-15000}}"
WEB_SEARCH_MAX_RESULTS="${CONTEXT_KIT_WEB_SEARCH_MAX_RESULTS:-${MAX_RESULTS:-10}}"
WEB_SEARCH_CHROME_PATH="${CONTEXT_KIT_WEB_SEARCH_CHROME_PATH:-${CHROME_PATH:-/usr/bin/chromium}}"
WEB_SEARCH_BROWSER_USER_AGENT="${CONTEXT_KIT_WEB_SEARCH_BROWSER_USER_AGENT:-${BROWSER_SEARCH_USER_AGENT:-}}"
WEB_SEARCH_MCP_COMPAT_MODE="${CONTEXT_KIT_WEB_SEARCH_MCP_COMPAT_MODE:-${MCP_COMPAT_MODE:-}}"
DOCS_CONTAINER_NAME="context-kit-docs-mcp" DOCS_CONTAINER_NAME="context-kit-docs-mcp"
DOCS_SOURCES_FILE="${DATA_DIR}/docs-sources.txt" DOCS_SOURCES_FILE="${DATA_DIR}/docs-sources.txt"
DOCS_DATA_DIR="${DATA_DIR}/docs" DOCS_DATA_DIR="${DATA_DIR}/docs"
MODELS_DATA_DIR="${DATA_DIR}/models" MODELS_DATA_DIR="${DATA_DIR}/models"
DOCS_LOCAL_SOURCES_DIR="${CONTEXT_KIT_DOCS_LOCAL_SOURCES_DIR:-${DATA_DIR}/local-sources}"
DOCS_LOCAL_SOURCES_PORT="${CONTEXT_KIT_DOCS_LOCAL_SOURCES_PORT:-8769}"
WEB_SEARCH_IMAGE="${CONTEXT_KIT_WEB_SEARCH_IMAGE:-context-kit/web-search-mcp:latest}" WEB_SEARCH_IMAGE="${CONTEXT_KIT_WEB_SEARCH_IMAGE:-context-kit/web-search-mcp:latest}"
DOCS_IMAGE="${CONTEXT_KIT_DOCS_IMAGE:-context-kit/docs-mcp:latest}" DOCS_IMAGE="${CONTEXT_KIT_DOCS_IMAGE:-context-kit/docs-mcp:latest}"
@@ -86,6 +105,8 @@ compose() {
CONTEXT_KIT_DOCS_MAX_GET_BYTES="${CONTEXT_KIT_DOCS_MAX_GET_BYTES:-75000}" \ CONTEXT_KIT_DOCS_MAX_GET_BYTES="${CONTEXT_KIT_DOCS_MAX_GET_BYTES:-75000}" \
CONTEXT_KIT_DOCS_EMBED_MODEL="${CONTEXT_KIT_DOCS_EMBED_MODEL:-BAAI/bge-small-en-v1.5}" \ CONTEXT_KIT_DOCS_EMBED_MODEL="${CONTEXT_KIT_DOCS_EMBED_MODEL:-BAAI/bge-small-en-v1.5}" \
CONTEXT_KIT_DOCS_PREINDEX="${CONTEXT_KIT_DOCS_PREINDEX:-0}" \ CONTEXT_KIT_DOCS_PREINDEX="${CONTEXT_KIT_DOCS_PREINDEX:-0}" \
CONTEXT_KIT_DOCS_LOCAL_SOURCES_DIR="${DOCS_LOCAL_SOURCES_DIR}" \
CONTEXT_KIT_DOCS_LOCAL_SOURCES_PORT="${DOCS_LOCAL_SOURCES_PORT}" \
BUILDX_BUILDER="${CONTEXT_KIT_BUILDX_BUILDER:-${BUILDX_BUILDER:-default}}" \ BUILDX_BUILDER="${CONTEXT_KIT_BUILDX_BUILDER:-${BUILDX_BUILDER:-default}}" \
docker compose -p "${PROJECT}" -f "${COMPOSE_FILE}" "$@" docker compose -p "${PROJECT}" -f "${COMPOSE_FILE}" "$@"
} }
@@ -112,11 +133,12 @@ prepare_data_dirs() {
ensure_writable_dir "${DATA_DIR}" ensure_writable_dir "${DATA_DIR}"
ensure_writable_dir "${DOCS_DATA_DIR}" ensure_writable_dir "${DOCS_DATA_DIR}"
ensure_writable_dir "${MODELS_DATA_DIR}" ensure_writable_dir "${MODELS_DATA_DIR}"
ensure_writable_dir "${DOCS_LOCAL_SOURCES_DIR}"
} }
check_data_dirs() { check_data_dirs() {
local ok=0 dir local ok=0 dir
for dir in "${DATA_DIR}" "${DOCS_DATA_DIR}" "${MODELS_DATA_DIR}"; do for dir in "${DATA_DIR}" "${DOCS_DATA_DIR}" "${MODELS_DATA_DIR}" "${DOCS_LOCAL_SOURCES_DIR}"; do
if [[ ! -d "${dir}" ]]; then if [[ ! -d "${dir}" ]]; then
printf 'warn data directory missing: %s (run context-kit start)\n' "${dir}" printf 'warn data directory missing: %s (run context-kit start)\n' "${dir}"
elif [[ -w "${dir}" && -x "${dir}" ]]; then elif [[ -w "${dir}" && -x "${dir}" ]]; then
@@ -129,6 +151,41 @@ check_data_dirs() {
return "${ok}" return "${ok}"
} }
# Verify that the web-search image carries the fetch_url max-bytes schema patch.
#
# Runs a throwaway node process inside WEB_SEARCH_IMAGE and succeeds only when:
#   - the configured WEB_SEARCH_MAX_BYTES is a positive integer,
#   - the MAX_BYTES value handed to the container equals it, and
#   - the installed server.js contains the patched max_download_bytes line.
# All output is discarded; callers rely on the exit status alone.
check_web_search_schema_patch() {
  docker run --rm --entrypoint node \
    -e MAX_BYTES="${WEB_SEARCH_MAX_BYTES}" \
    -e EXPECTED_MAX_BYTES="${WEB_SEARCH_MAX_BYTES}" \
    "${WEB_SEARCH_IMAGE}" \
    -e '
const fs = require("node:fs");
const expected = Number(process.env.EXPECTED_MAX_BYTES);
const actual = Number(process.env.MAX_BYTES);
// A zero, negative, fractional, or non-numeric limit cannot work with the
// patched schema (.min(1).max(MAX_BYTES)), so reject it here rather than
// coercing both sides to 0 and reporting a pass.
if (!Number.isInteger(expected) || expected < 1) process.exit(1);
if (actual !== expected) process.exit(1);
const serverPath = "/usr/local/lib/node_modules/@zhafron/mcp-web-search/dist/src/server.js";
const source = fs.readFileSync(serverPath, "utf8");
if (!source.includes("max_download_bytes: z.number().int().min(1).max(MAX_BYTES).optional()")) process.exit(1);
' >/dev/null 2>&1
}
# Confirm that the web-search image ships the Context Kit Bing provider.
#
# Reads the installed providers/bing.js inside WEB_SEARCH_IMAGE and succeeds
# only when every marker string of the override is present. Output is
# discarded; callers rely on the exit status alone.
check_web_search_bing_override() {
  docker run --rm --entrypoint node \
    "${WEB_SEARCH_IMAGE}" \
    -e '
const { readFileSync } = require("node:fs");
const providerFile = "/usr/local/lib/node_modules/@zhafron/mcp-web-search/dist/src/providers/bing.js";
const contents = readFileSync(providerFile, "utf8");
const requiredMarkers = [
  "Context Kit override for @zhafron/mcp-web-search 1.3.0",
  "waitForSelector",
  "decodeBingRedirect",
];
const installed = requiredMarkers.every((marker) => contents.includes(marker));
process.exit(installed ? 0 : 1);
' >/dev/null 2>&1
}
# Succeeds when `test -x` accepts WEB_SEARCH_CHROME_PATH inside
# WEB_SEARCH_IMAGE. Output is discarded; only the exit status matters.
check_web_search_chrome() {
  local -a probe=(
    docker run --rm
    --entrypoint /usr/bin/test
    "${WEB_SEARCH_IMAGE}"
    -x "${WEB_SEARCH_CHROME_PATH}"
  )
  "${probe[@]}" >/dev/null 2>&1
}
warn() { warn() {
printf 'warn: %s\n' "$*" >&2 printf 'warn: %s\n' "$*" >&2
} }
@@ -257,9 +314,12 @@ cmd_status() {
printf '\nImages\n' printf '\nImages\n'
docker image ls --format '{{.Repository}}:{{.Tag}}\t{{.Size}}' \ docker image ls --format '{{.Repository}}:{{.Tag}}\t{{.Size}}' \
| grep -E '^(context-kit/|ghcr.io/yamadashy/repomix:)' || true | grep -E '^(context-kit/|ghcr.io/yamadashy/repomix:)' || true
printf '\nLabeled containers\n'
docker ps -a --filter label=dev.context-kit=true --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}'
printf '\nDocs MCP endpoint\n- %s (container: %s)\n' "${DOCS_HTTP_URL}" "${DOCS_CONTAINER_NAME}" printf '\nDocs MCP endpoint\n- %s (container: %s)\n' "${DOCS_HTTP_URL}" "${DOCS_CONTAINER_NAME}"
printf '\nDocs sources\n' printf '\nDocs sources\n'
resolved_sources | sed 's/^/- /' resolved_sources | sed 's/^/- /'
printf '\nLocal docs source directory\n- %s (served inside docs-mcp at http://127.0.0.1:%s/)\n' "${DOCS_LOCAL_SOURCES_DIR}" "${DOCS_LOCAL_SOURCES_PORT}"
printf '\nData directory\n- %s\n' "${DATA_DIR}" printf '\nData directory\n- %s\n' "${DATA_DIR}"
} }
@@ -303,6 +363,27 @@ cmd_doctor() {
fi fi
done done
if docker image inspect "${WEB_SEARCH_IMAGE}" >/dev/null 2>&1; then
if check_web_search_schema_patch; then
printf 'pass web-search fetch_url max-bytes schema patch: %s\n' "${WEB_SEARCH_MAX_BYTES}"
else
printf 'fail web-search max-bytes schema patch missing; run: context-kit build\n'
ok=1
fi
if check_web_search_bing_override; then
printf 'pass web-search Bing provider override installed\n'
else
printf 'fail web-search Bing provider override missing; run: context-kit build\n'
ok=1
fi
if check_web_search_chrome; then
printf 'pass web-search Chromium path: %s\n' "${WEB_SEARCH_CHROME_PATH}"
else
printf 'fail web-search Chromium path unavailable: %s\n' "${WEB_SEARCH_CHROME_PATH}"
ok=1
fi
fi
if command -v curl >/dev/null 2>&1 && curl -fsS "http://127.0.0.1:${SEARXNG_PORT}/healthz" >/dev/null 2>&1; then if command -v curl >/dev/null 2>&1 && curl -fsS "http://127.0.0.1:${SEARXNG_PORT}/healthz" >/dev/null 2>&1; then
printf 'pass SearXNG responds on 127.0.0.1:%s\n' "${SEARXNG_PORT}" printf 'pass SearXNG responds on 127.0.0.1:%s\n' "${SEARXNG_PORT}"
else else
@@ -331,11 +412,14 @@ cmd_web_search() {
exec docker run --rm -i \ exec docker run --rm -i \
--label dev.context-kit=true \ --label dev.context-kit=true \
--network "${NETWORK}" \ --network "${NETWORK}" \
-e DEFAULT_SEARCH_PROVIDER="${DEFAULT_SEARCH_PROVIDER:-searxng}" \ -e DEFAULT_SEARCH_PROVIDER="${WEB_SEARCH_PROVIDER}" \
-e SEARXNG_URL="${SEARXNG_URL:-http://searxng:8080}" \ -e SEARXNG_URL="${SEARXNG_URL:-http://searxng:8080}" \
-e CHROME_PATH="${CHROME_PATH:-/usr/bin/chromium}" \ -e CHROME_PATH="${WEB_SEARCH_CHROME_PATH}" \
-e HTTP_TIMEOUT="${HTTP_TIMEOUT:-15000}" \ -e HTTP_TIMEOUT="${WEB_SEARCH_HTTP_TIMEOUT}" \
-e MAX_RESULTS="${MAX_RESULTS:-10}" \ -e MAX_BYTES="${WEB_SEARCH_MAX_BYTES}" \
-e MAX_RESULTS="${WEB_SEARCH_MAX_RESULTS}" \
-e BROWSER_SEARCH_USER_AGENT="${WEB_SEARCH_BROWSER_USER_AGENT}" \
-e MCP_COMPAT_MODE="${WEB_SEARCH_MCP_COMPAT_MODE}" \
"${WEB_SEARCH_IMAGE}" "${WEB_SEARCH_IMAGE}"
} }
@@ -397,12 +481,13 @@ print_opencode() {
"type": "local", "type": "local",
"command": ["${bin}", "web-search"], "command": ["${bin}", "web-search"],
"enabled": true, "enabled": true,
"timeout": 60000 "timeout": 150000
}, },
"context-docs": { "context-docs": {
"type": "remote", "type": "remote",
"url": "${url}", "url": "${url}",
"enabled": true "enabled": true,
"timeout": 150000
}, },
"context-repomix": { "context-repomix": {
"type": "local", "type": "local",
@@ -451,7 +536,7 @@ cmd_install() {
cmd_redaction_check() { cmd_redaction_check() {
local bad=0 local bad=0
local local_path_terms='/(home|Users)/[^/[:space:]]+|[A-Za-z]:\\Users\\[^\\[:space:]]+' local local_path_terms='/(home|Users)/[^/[:space:]]+|/data/(projects|opencode-mcp)[^[:space:]]*|[A-Za-z]:\\Users\\[^\\[:space:]]+'
local secret_terms='AKIA[0-9A-Z]{16}|BEGIN (RSA |OPENSSH |EC |DSA )?PRIVATE KEY|xox[baprs]-|sk-[A-Za-z0-9_-]{20,}|ghp_[A-Za-z0-9_]{20,}|github_pat_[A-Za-z0-9_]{20,}|glpat-[A-Za-z0-9_-]{20,}|gitea_[A-Za-z0-9_-]{20,}' local secret_terms='AKIA[0-9A-Z]{16}|BEGIN (RSA |OPENSSH |EC |DSA )?PRIVATE KEY|xox[baprs]-|sk-[A-Za-z0-9_-]{20,}|ghp_[A-Za-z0-9_]{20,}|github_pat_[A-Za-z0-9_]{20,}|glpat-[A-Za-z0-9_-]{20,}|gitea_[A-Za-z0-9_-]{20,}'
# Scan only what would be published: skip .git plus everything .gitignore # Scan only what would be published: skip .git plus everything .gitignore

View File

@@ -9,7 +9,6 @@ services:
environment: environment:
BASE_URL: "http://127.0.0.1:${CONTEXT_KIT_SEARXNG_PORT:-8099}/" BASE_URL: "http://127.0.0.1:${CONTEXT_KIT_SEARXNG_PORT:-8099}/"
INSTANCE_NAME: "context-kit-search" INSTANCE_NAME: "context-kit-search"
SEARXNG_SECRET: "${CONTEXT_KIT_SEARXNG_SECRET:-change-me-local-only}"
volumes: volumes:
- ./docker/web-search/searxng/settings.yml:/etc/searxng/settings.yml:ro - ./docker/web-search/searxng/settings.yml:/etc/searxng/settings.yml:ro
- searxng-cache:/var/cache/searxng - searxng-cache:/var/cache/searxng
@@ -19,16 +18,21 @@ services:
web-search-mcp: web-search-mcp:
build: build:
context: ./docker/web-search context: ./docker/web-search
args:
MCP_WEB_SEARCH_MAX_BYTES: "${CONTEXT_KIT_WEB_SEARCH_MAX_BYTES:-52428800}"
image: context-kit/web-search-mcp:latest image: context-kit/web-search-mcp:latest
profiles: ["mcp"] profiles: ["mcp"]
stdin_open: true stdin_open: true
tty: false tty: false
environment: environment:
DEFAULT_SEARCH_PROVIDER: "searxng" DEFAULT_SEARCH_PROVIDER: "${CONTEXT_KIT_WEB_SEARCH_PROVIDER:-searxng}"
SEARXNG_URL: "http://searxng:8080" SEARXNG_URL: "http://searxng:8080"
CHROME_PATH: "/usr/bin/chromium" CHROME_PATH: "${CONTEXT_KIT_WEB_SEARCH_CHROME_PATH:-/usr/bin/chromium}"
HTTP_TIMEOUT: "15000" HTTP_TIMEOUT: "${CONTEXT_KIT_WEB_SEARCH_HTTP_TIMEOUT:-15000}"
MAX_RESULTS: "10" MAX_BYTES: "${CONTEXT_KIT_WEB_SEARCH_MAX_BYTES:-52428800}"
MAX_RESULTS: "${CONTEXT_KIT_WEB_SEARCH_MAX_RESULTS:-10}"
BROWSER_SEARCH_USER_AGENT: "${CONTEXT_KIT_WEB_SEARCH_BROWSER_USER_AGENT:-}"
MCP_COMPAT_MODE: "${CONTEXT_KIT_WEB_SEARCH_MCP_COMPAT_MODE:-}"
labels: labels:
dev.context-kit: "true" dev.context-kit: "true"
@@ -53,6 +57,7 @@ services:
DOCS_MCP_MAX_GET_BYTES: "${CONTEXT_KIT_DOCS_MAX_GET_BYTES:-75000}" DOCS_MCP_MAX_GET_BYTES: "${CONTEXT_KIT_DOCS_MAX_GET_BYTES:-75000}"
DOCS_MCP_EMBED_MODEL: "${CONTEXT_KIT_DOCS_EMBED_MODEL:-BAAI/bge-small-en-v1.5}" DOCS_MCP_EMBED_MODEL: "${CONTEXT_KIT_DOCS_EMBED_MODEL:-BAAI/bge-small-en-v1.5}"
DOCS_MCP_ALLOW_ORIGIN: "${CONTEXT_KIT_DOCS_ALLOW_ORIGIN:-}" DOCS_MCP_ALLOW_ORIGIN: "${CONTEXT_KIT_DOCS_ALLOW_ORIGIN:-}"
DOCS_MCP_LOCAL_SOURCES_PORT: "${CONTEXT_KIT_DOCS_LOCAL_SOURCES_PORT:-8769}"
# Preindex on startup is off by default; use the docs_refresh tool to # Preindex on startup is off by default; use the docs_refresh tool to
# refresh on demand. Set CONTEXT_KIT_DOCS_PREINDEX=1 to restore eager. # refresh on demand. Set CONTEXT_KIT_DOCS_PREINDEX=1 to restore eager.
DOCS_MCP_PREINDEX: "${CONTEXT_KIT_DOCS_PREINDEX:-0}" DOCS_MCP_PREINDEX: "${CONTEXT_KIT_DOCS_PREINDEX:-0}"
@@ -60,6 +65,7 @@ services:
- ${CONTEXT_KIT_DATA_DIR:-${HOME}/.local/share/context-kit}/docs:/data - ${CONTEXT_KIT_DATA_DIR:-${HOME}/.local/share/context-kit}/docs:/data
- ${CONTEXT_KIT_DATA_DIR:-${HOME}/.local/share/context-kit}/models:/models - ${CONTEXT_KIT_DATA_DIR:-${HOME}/.local/share/context-kit}/models:/models
- ${CONTEXT_KIT_DATA_DIR:-${HOME}/.local/share/context-kit}/docs-sources.txt:/etc/context-kit/docs-sources.txt:ro - ${CONTEXT_KIT_DATA_DIR:-${HOME}/.local/share/context-kit}/docs-sources.txt:/etc/context-kit/docs-sources.txt:ro
- ${CONTEXT_KIT_DOCS_LOCAL_SOURCES_DIR:-${CONTEXT_KIT_DATA_DIR:-${HOME}/.local/share/context-kit}/local-sources}:/etc/context-kit/local-sources:ro
healthcheck: healthcheck:
test: ["CMD-SHELL", "python -c \"import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://127.0.0.1:8000/status', timeout=2).status < 500 else 1)\""] test: ["CMD-SHELL", "python -c \"import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://127.0.0.1:8000/status', timeout=2).status < 500 else 1)\""]
interval: 30s interval: 30s

View File

@@ -4,6 +4,5 @@
https://code.claude.com/docs/llms.txt https://code.claude.com/docs/llms.txt
https://developers.openai.com/api/docs/llms.txt https://developers.openai.com/api/docs/llms.txt
https://developers.openai.com/api/reference/llms.txt https://developers.openai.com/api/reference/llms.txt
https://docs.anthropic.com/llms.txt
https://openrouter.ai/docs/llms.txt https://openrouter.ai/docs/llms.txt
https://modelcontextprotocol.io/llms-full.txt https://modelcontextprotocol.io/llms-full.txt

View File

@@ -1,3 +1,4 @@
* *
!Dockerfile !Dockerfile
!entrypoint.sh !entrypoint.sh
!constraints.txt

View File

@@ -1,7 +1,10 @@
FROM python:3.12-slim FROM python:3.12-slim@sha256:6c4dd321d176d61ea848dc8c73a4f7dbae8f70e0ee48bb411ea2f045b599fa8e
ARG LLMS_TXT_MCP_VERSION=0.2.0 ARG LLMS_TXT_MCP_VERSION=0.2.0
ARG MCP_PROXY_VERSION=0.12.0 ARG MCP_PROXY_VERSION=0.12.0
ARG TORCH_VERSION=2.12.1+cpu
COPY constraints.txt /tmp/context-kit-docs-constraints.txt
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y --no-install-recommends \ && apt-get install -y --no-install-recommends \
@@ -11,17 +14,19 @@ RUN apt-get update \
# Install CPU-only torch first so llms-txt-mcp does not pull large CUDA wheels. # Install CPU-only torch first so llms-txt-mcp does not pull large CUDA wheels.
RUN pip install --no-cache-dir \ RUN pip install --no-cache-dir \
--index-url https://download.pytorch.org/whl/cpu \ --index-url https://download.pytorch.org/whl/cpu \
torch -c /tmp/context-kit-docs-constraints.txt \
"torch==${TORCH_VERSION}"
# llms-txt-mcp does the indexing/search; mcp-proxy fronts its stdio transport # llms-txt-mcp does the indexing/search; mcp-proxy fronts its stdio transport
# as Streamable HTTP so multiple MCP clients can share one long-lived process # as Streamable HTTP so multiple MCP clients can share one long-lived process
# (and therefore one Chroma DB writer). # (and therefore one Chroma DB writer).
RUN if [ -n "${LLMS_TXT_MCP_VERSION}" ]; then \ RUN if [ -n "${LLMS_TXT_MCP_VERSION}" ]; then \
pip install --no-cache-dir "llms-txt-mcp==${LLMS_TXT_MCP_VERSION}"; \ pip install --no-cache-dir -c /tmp/context-kit-docs-constraints.txt "llms-txt-mcp==${LLMS_TXT_MCP_VERSION}"; \
else \ else \
pip install --no-cache-dir llms-txt-mcp; \ pip install --no-cache-dir -c /tmp/context-kit-docs-constraints.txt llms-txt-mcp; \
fi \ fi \
&& pip install --no-cache-dir "mcp-proxy==${MCP_PROXY_VERSION}" && pip install --no-cache-dir -c /tmp/context-kit-docs-constraints.txt "mcp-proxy==${MCP_PROXY_VERSION}" \
&& rm /tmp/context-kit-docs-constraints.txt
COPY entrypoint.sh /usr/local/bin/docs-mcp-entrypoint COPY entrypoint.sh /usr/local/bin/docs-mcp-entrypoint
RUN chmod +x /usr/local/bin/docs-mcp-entrypoint RUN chmod +x /usr/local/bin/docs-mcp-entrypoint

107
docker/docs/constraints.txt Normal file
View File

@@ -0,0 +1,107 @@
aiohappyeyeballs==2.6.2
aiohttp==3.14.1
aiosignal==1.4.0
annotated-doc==0.0.4
annotated-types==0.7.0
anyio==4.14.1
attrs==26.1.0
bcrypt==5.0.0
build==1.5.0
certifi==2026.6.17
cffi==2.0.0
charset-normalizer==3.4.7
chromadb==1.5.9
click==8.4.2
cryptography==49.0.0
durationpy==0.10
filelock==3.29.0
flatbuffers==25.12.19
frozenlist==1.8.0
fsspec==2026.4.0
googleapis-common-protos==1.75.0
grpcio==1.81.1
h11==0.16.0
hf-xet==1.5.1
httpcore==1.0.9
httptools==0.8.0
httpx==0.28.1
httpx-sse==0.4.3
httpx_auth==0.23.1
huggingface_hub==1.20.1
idna==3.18
importlib_resources==7.1.0
Jinja2==3.1.6
joblib==1.5.3
jsonschema==4.26.0
jsonschema-specifications==2025.9.1
kubernetes==36.0.2
llms-txt-mcp==0.2.0
markdown-it-py==4.2.0
MarkupSafe==3.0.3
mcp==1.28.0
mcp-proxy==0.12.0
mdurl==0.1.2
mmh3==5.2.1
mpmath==1.3.0
multidict==6.7.1
narwhals==2.22.1
networkx==3.6.1
numpy==2.5.0
oauthlib==3.3.1
onnxruntime==1.27.0
opentelemetry-api==1.43.0
opentelemetry-exporter-otlp-proto-common==1.43.0
opentelemetry-exporter-otlp-proto-grpc==1.43.0
opentelemetry-proto==1.43.0
opentelemetry-sdk==1.43.0
opentelemetry-semantic-conventions==0.64b0
orjson==3.11.9
overrides==7.7.0
packaging==26.2
propcache==0.5.2
protobuf==7.35.1
pybase64==1.4.3
pycparser==3.0
pydantic==2.13.4
pydantic-settings==2.14.2
pydantic_core==2.46.4
Pygments==2.20.0
PyJWT==2.13.0
PyPika==0.51.1
pyproject_hooks==1.2.0
python-dateutil==2.9.0.post0
python-dotenv==1.2.2
python-multipart==0.0.32
PyYAML==6.0.3
referencing==0.37.0
regex==2026.5.9
requests==2.34.2
requests-oauthlib==2.0.0
rich==15.0.0
rpds-py==2026.5.1
safetensors==0.8.0
scikit-learn==1.9.0
scipy==1.18.0
sentence-transformers==5.6.0
setuptools==70.2.0
shellingham==1.5.4
six==1.17.0
sse-starlette==3.4.5
starlette==1.3.1
sympy==1.14.0
tenacity==9.1.4
threadpoolctl==3.6.0
tokenizers==0.22.2
torch==2.12.1+cpu
tqdm==4.68.3
transformers==5.12.1
typer==0.25.1
typing-inspection==0.4.2
typing_extensions==4.15.0
urllib3==2.7.0
uvicorn==0.49.0
uvloop==0.22.1
watchfiles==1.2.0
websocket-client==1.9.0
websockets==16.0
yarl==1.24.2

View File

@@ -12,6 +12,8 @@
set -eu set -eu
sources_file="${DOCS_MCP_SOURCES_FILE:-/etc/context-kit/docs-sources.txt}" sources_file="${DOCS_MCP_SOURCES_FILE:-/etc/context-kit/docs-sources.txt}"
local_sources_dir="${DOCS_MCP_LOCAL_SOURCES_DIR:-/etc/context-kit/local-sources}"
local_sources_port="${DOCS_MCP_LOCAL_SOURCES_PORT:-8769}"
if [ ! -r "$sources_file" ]; then if [ ! -r "$sources_file" ]; then
echo "docs-mcp: sources file not readable: $sources_file" >&2 echo "docs-mcp: sources file not readable: $sources_file" >&2
@@ -27,11 +29,41 @@ if [ -z "$sources" ]; then
exit 64 exit 64
fi fi
if [ -d "$local_sources_dir" ]; then
  # Serve the local-sources directory over loopback HTTP in the background;
  # the server's output is captured in a log file under /tmp.
  python -m http.server "$local_sources_port" \
    --directory "$local_sources_dir" \
    --bind 127.0.0.1 \
    >/tmp/context-kit-local-sources.log 2>&1 &
  local_sources_pid="$!"

  # Poll the server (up to 20 attempts) before continuing; abort startup with
  # exit code 65 if it never answers.
  if ! python - "$local_sources_port" <<'PY'
import sys
import time
import urllib.request

probe_url = f"http://127.0.0.1:{sys.argv[1]}/"
failure = None
attempt = 0
while attempt < 20:
    attempt += 1
    try:
        with urllib.request.urlopen(probe_url, timeout=0.5) as reply:
            if reply.status < 500:
                sys.exit(0)
    except Exception as exc:
        failure = exc
    time.sleep(0.1)
sys.exit(f"local source server did not become ready: {failure}")
PY
  then
    kill "$local_sources_pid" 2>/dev/null || true
    echo "docs-mcp: local source server failed on 127.0.0.1:$local_sources_port" >&2
    exit 65
  fi
fi
# By default llms-txt-mcp 0.2.0 re-embeds every source on launch (the actual # By default llms-txt-mcp 0.2.0 re-embeds every source on launch (the actual
# default is a background preindex, --no-preindex only disables the foreground # default is a background preindex, --no-preindex only disables the foreground
# variant). On a long-lived container that just wastes ~5 min of CPU per # variant). On a long-lived container that wastes CPU per restart, so we disable
# restart, so we disable BOTH and let the caller use `docs_refresh` on demand. # BOTH. Missing/stale sources still refresh on first docs_query/docs_refresh.
# Set DOCS_MCP_PREINDEX=1 to restore the eager behavior. # Set DOCS_MCP_PREINDEX=1 to restore eager startup indexing.
preindex_flag="--no-preindex --no-background-preindex" preindex_flag="--no-preindex --no-background-preindex"
if [ "${DOCS_MCP_PREINDEX:-0}" = "1" ]; then if [ "${DOCS_MCP_PREINDEX:-0}" = "1" ]; then
preindex_flag="" preindex_flag=""

View File

@@ -1,2 +1,5 @@
* *
!Dockerfile !Dockerfile
!patch-mcp-web-search.mjs
!overrides/
!overrides/bing.js

View File

@@ -1,7 +1,14 @@
FROM node:22-bookworm-slim FROM node:22-bookworm-slim@sha256:813a7480f28fdadac1f7f5c824bcdad435b5bc1322a5968bbbdef8d058f9dff4
ARG MCP_WEB_SEARCH_VERSION=1.3.0 ARG MCP_WEB_SEARCH_VERSION=1.3.0
ARG MCP_WEB_SEARCH_MAX_BYTES=52428800
COPY patch-mcp-web-search.mjs /tmp/patch-mcp-web-search.mjs
COPY overrides/bing.js /tmp/context-kit-bing-provider.js
# Chromium intentionally tracks Debian security updates inside the pinned base
# image family; Bing's browser path is more likely to break with stale Chromium
# than with patched OS packages.
RUN apt-get update \ RUN apt-get update \
&& apt-get install -y --no-install-recommends \ && apt-get install -y --no-install-recommends \
ca-certificates \ ca-certificates \
@@ -10,11 +17,15 @@ RUN apt-get update \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
RUN npm install -g "@zhafron/mcp-web-search@${MCP_WEB_SEARCH_VERSION}" \ RUN npm install -g "@zhafron/mcp-web-search@${MCP_WEB_SEARCH_VERSION}" \
&& cp /tmp/context-kit-bing-provider.js /usr/local/lib/node_modules/@zhafron/mcp-web-search/dist/src/providers/bing.js \
&& node /tmp/patch-mcp-web-search.mjs \
&& rm /tmp/patch-mcp-web-search.mjs /tmp/context-kit-bing-provider.js \
&& npm cache clean --force && npm cache clean --force
ENV CHROME_PATH=/usr/bin/chromium \ ENV CHROME_PATH=/usr/bin/chromium \
DEFAULT_SEARCH_PROVIDER=searxng \ DEFAULT_SEARCH_PROVIDER=searxng \
HTTP_TIMEOUT=15000 \ HTTP_TIMEOUT=15000 \
MAX_BYTES=${MCP_WEB_SEARCH_MAX_BYTES} \
MAX_RESULTS=10 \ MAX_RESULTS=10 \
SEARXNG_URL=http://searxng:8080 SEARXNG_URL=http://searxng:8080

View File

@@ -0,0 +1,114 @@
import { PUPPETEER_TIMEOUT } from "../constants.js";
import { browserPool } from "../utils/browser-pool.js";
import { getAcceptLanguageHeader, getMarketFromLang } from "../utils/user-agent.js";
import { searchCache, createCacheKey } from "../utils/cache.js";
// Context Kit override for @zhafron/mcp-web-search 1.3.0.
// The upstream provider can read Bing before result cards render and return an
// empty fallback. Keep this as a direct provider replacement until upstream
// waits for cards and decodes current /ck/a redirects reliably.
const DEFAULT_BROWSER_SEARCH_USER_AGENT = process.env.BROWSER_SEARCH_USER_AGENT ||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36";
/**
 * Decode a URL-safe base64 string into UTF-8 text.
 *
 * The URL-safe characters `-` and `_` are mapped back to `+` and `/`, and the
 * input is re-padded with `=` to a multiple of four before decoding.
 *
 * @param {string} value - Base64url text, with or without padding.
 * @returns {string} The decoded bytes interpreted as UTF-8.
 */
function decodeBase64Url(value) {
    const standardAlphabet = value.replaceAll("-", "+").replaceAll("_", "/");
    const missingPadding = (4 - (standardAlphabet.length % 4)) % 4;
    const paddedInput = standardAlphabet + "=".repeat(missingPadding);
    return Buffer.from(paddedInput, "base64").toString("utf-8");
}
/**
 * Bing search provider that drives a pooled Chromium page, waits for result
 * cards to render, and rewrites Bing `/ck/a` tracking links to their targets.
 */
export class BingProvider {
    name = "bing";

    /**
     * Resolve a Bing `/ck/a` tracking redirect to the URL it points at.
     *
     * The `u` query parameter is tried as base64url text both as-is and with
     * a two-character letter+digit prefix removed; the first candidate that
     * decodes to an http(s) URL is returned.
     *
     * @param {string} href - Absolute or Bing-relative link taken from a result card.
     * @returns {string} The decoded target, the normalized input URL when it
     *   is not a decodable redirect, or `href` unchanged when it cannot be parsed.
     */
    decodeBingRedirect(href) {
        try {
            const url = new URL(href, "https://www.bing.com/");
            // Accept the bare domain and any subdomain, not only www.bing.com.
            const isBingHost = url.hostname === "bing.com" || url.hostname.endsWith(".bing.com");
            if (isBingHost && url.pathname === "/ck/a") {
                const encoded = url.searchParams.get("u");
                if (encoded) {
                    const candidates = [encoded];
                    if (/^[a-z][0-9]/i.test(encoded)) {
                        candidates.push(encoded.slice(2));
                    }
                    for (const candidate of candidates) {
                        try {
                            const decoded = decodeBase64Url(candidate);
                            if (/^https?:\/\//i.test(decoded)) {
                                return decoded;
                            }
                        }
                        catch {
                            // Best effort: an undecodable candidate just means
                            // the next one (or the original URL) is used.
                        }
                    }
                }
            }
            return url.toString();
        }
        catch {
            return href;
        }
    }

    /**
     * Run a Bing search in a pooled browser page.
     *
     * @param {string} q - Search query.
     * @param {number} limit - Maximum number of result cards to collect.
     * @param {string} lang - Language hint used for the market and Accept-Language header.
     * @returns {Promise<Array<{title: string, url: string, snippet?: string, source: string}>>}
     *   Parsed results; empty when no result cards were found.
     * @throws {Error} When Bing answers the navigation with an HTTP status >= 400.
     */
    async search(q, limit, lang) {
        const cacheKey = createCacheKey("bing", q, limit, lang);
        const cached = searchCache.get(cacheKey);
        if (cached) {
            return cached;
        }
        const market = getMarketFromLang(lang);
        const results = await browserPool.withBrowser(async (browser) => {
            const page = await browser.newPage();
            try {
                await page.setViewport({ width: 1365, height: 768 });
                await page.setUserAgent(DEFAULT_BROWSER_SEARCH_USER_AGENT);
                await page.setExtraHTTPHeaders(getAcceptLanguageHeader(lang));
                const url = new URL("https://www.bing.com/search");
                url.searchParams.set("q", q);
                url.searchParams.set("mkt", market);
                const response = await page.goto(url.toString(), {
                    waitUntil: "domcontentloaded",
                    timeout: PUPPETEER_TIMEOUT
                });
                if (response && response.status() >= 400) {
                    throw new Error(`Bing HTTP ${response.status()}`);
                }
                // Cards render after DOMContentLoaded. A timeout here is not
                // fatal: the scrape below simply finds nothing.
                await page.waitForSelector("li.b_algo h2 a[href], li.b_algo a[href]", { timeout: 10000 }).catch(() => undefined);
                const items = await page.evaluate(maxResults => {
                    const parsed = [];
                    for (const card of Array.from(document.querySelectorAll("li.b_algo"))) {
                        const anchor = card.querySelector("h2 a[href]") || card.querySelector("a[href]");
                        const title = anchor?.textContent?.trim() || "";
                        const href = anchor?.getAttribute("href") || "";
                        if (!title || !href)
                            continue;
                        const snippetElement = card.querySelector("div.b_caption p, div.b_snippet, p");
                        const snippet = snippetElement?.textContent?.trim() || undefined;
                        parsed.push({ title, url: href, snippet });
                        if (parsed.length >= maxResults)
                            break;
                    }
                    return parsed;
                }, limit);
                return items.flatMap(result => {
                    try {
                        const absolute = new URL(result.url, "https://www.bing.com/").toString();
                        const decoded = this.decodeBingRedirect(absolute);
                        // Drop entries whose final URL does not parse.
                        new URL(decoded);
                        return [{ ...result, url: decoded, source: "bing" }];
                    }
                    catch {
                        return [];
                    }
                });
            }
            finally {
                await page.close();
            }
        });
        // Cache only real hits. An empty list usually means the cards never
        // rendered, and caching it (an empty array is truthy) would replay that
        // failure on every later call instead of retrying.
        if (Array.isArray(results) && results.length > 0) {
            searchCache.set(cacheKey, results);
        }
        return results;
    }

    /**
     * Report whether a browser can be obtained from the pool.
     *
     * @returns {Promise<boolean>} `true` when `browserPool.getBrowser()` resolves, else `false`.
     */
    async isAvailable() {
        try {
            await browserPool.getBrowser();
            return true;
        }
        catch {
            return false;
        }
    }
}

View File

@@ -0,0 +1,28 @@
import fs from "node:fs";
// Context Kit patch for @zhafron/mcp-web-search 1.3.0.
// Upstream hard-codes the fetch_url schema limit to 25 MiB even though the
// runtime extractor already uses MAX_BYTES. Keep this narrow and fail the build
// if upstream changes the compiled source shape.
const serverPath = "/usr/local/lib/node_modules/@zhafron/mcp-web-search/dist/src/server.js";

// Each entry pairs the exact text expected in the compiled upstream file with
// what it is rewritten to. Entries are applied in order.
const replacements = [
    {
        before: 'import { MAX_RESULTS } from "./constants.js";',
        after: 'import { MAX_BYTES, MAX_RESULTS } from "./constants.js";'
    },
    {
        before: "max_download_bytes: z.number().int().min(1).max(26214400).optional()",
        after: "max_download_bytes: z.number().int().min(1).max(MAX_BYTES).optional()"
    }
];

// Thread the file text through every replacement; a missing target throws
// before anything is written back, so the build fails.
const patched = replacements.reduce((text, { before, after }) => {
    if (!text.includes(before)) {
        throw new Error(`mcp-web-search patch target not found: ${before}`);
    }
    return text.replace(before, after);
}, fs.readFileSync(serverPath, "utf8"));

fs.writeFileSync(serverPath, patched);

View File

@@ -15,8 +15,8 @@ search:
- json - json
server: server:
# Local placeholder. The Docker service also sets SEARXNG_SECRET from .env; # Local placeholder. Keep SearXNG bound to 127.0.0.1 unless you review this
# keep SearXNG bound to 127.0.0.1 unless you review this config separately. # config and replace the secret_key for a deliberate non-local deployment.
secret_key: "local-only-change-if-exposed" secret_key: "local-only-change-if-exposed"
limiter: false limiter: false
image_proxy: true image_proxy: true

View File

@@ -14,6 +14,13 @@ shell code.
| `CONTEXT_KIT_DATA_DIR` | `$HOME/.local/share/context-kit` | Persistent docs indexes and model cache | | `CONTEXT_KIT_DATA_DIR` | `$HOME/.local/share/context-kit` | Persistent docs indexes and model cache |
| `CONTEXT_KIT_COMPOSE_PROJECT` | `context-kit` | Docker Compose project and network prefix | | `CONTEXT_KIT_COMPOSE_PROJECT` | `context-kit` | Docker Compose project and network prefix |
| `CONTEXT_KIT_SEARXNG_PORT` | `8099` | Localhost SearXNG port | | `CONTEXT_KIT_SEARXNG_PORT` | `8099` | Localhost SearXNG port |
| `CONTEXT_KIT_WEB_SEARCH_MAX_BYTES` | `52428800` | Max bytes `context-web-search` accepts and downloads per fetch |
| `CONTEXT_KIT_WEB_SEARCH_PROVIDER` | `searxng` | Default `search_web` provider; fallback order depends on this provider |
| `CONTEXT_KIT_WEB_SEARCH_HTTP_TIMEOUT` | `15000` | HTTP timeout in milliseconds for search providers |
| `CONTEXT_KIT_WEB_SEARCH_MAX_RESULTS` | `10` | Default search result count when clients omit `limit` |
| `CONTEXT_KIT_WEB_SEARCH_CHROME_PATH` | `/usr/bin/chromium` | Chromium path inside the web-search image for Bing fallback |
| `CONTEXT_KIT_WEB_SEARCH_BROWSER_USER_AGENT` | bundled Chrome/Linux UA | User agent for the Chromium-backed Bing fallback |
| `CONTEXT_KIT_WEB_SEARCH_MCP_COMPAT_MODE` | unset | Set to `legacy` for MCP clients with weak tool-schema parsers |
| `CONTEXT_KIT_DOCS_PORT` | `8776` | Localhost port for the long-lived docs-mcp HTTP service | | `CONTEXT_KIT_DOCS_PORT` | `8776` | Localhost port for the long-lived docs-mcp HTTP service |
| `CONTEXT_KIT_DOCS_HTTP_URL` | `http://127.0.0.1:${CONTEXT_KIT_DOCS_PORT}/mcp` | URL emitted into HTTP MCP install snippets | | `CONTEXT_KIT_DOCS_HTTP_URL` | `http://127.0.0.1:${CONTEXT_KIT_DOCS_PORT}/mcp` | URL emitted into HTTP MCP install snippets |
| `CONTEXT_KIT_DOCS_ALLOW_ORIGIN` | unset | Optional exact browser CORS origin(s) for docs-mcp, separated by spaces | | `CONTEXT_KIT_DOCS_ALLOW_ORIGIN` | unset | Optional exact browser CORS origin(s) for docs-mcp, separated by spaces |
@@ -22,6 +29,8 @@ shell code.
| `CONTEXT_KIT_DOCS_MAX_GET_BYTES` | `75000` | Max bytes returned by docs retrieval | | `CONTEXT_KIT_DOCS_MAX_GET_BYTES` | `75000` | Max bytes returned by docs retrieval |
| `CONTEXT_KIT_DOCS_EMBED_MODEL` | `BAAI/bge-small-en-v1.5` | SentenceTransformers embedding model | | `CONTEXT_KIT_DOCS_EMBED_MODEL` | `BAAI/bge-small-en-v1.5` | SentenceTransformers embedding model |
| `CONTEXT_KIT_DOCS_PREINDEX` | `0` | Set to `1` to re-embed every source on container start | | `CONTEXT_KIT_DOCS_PREINDEX` | `0` | Set to `1` to re-embed every source on container start |
| `CONTEXT_KIT_DOCS_LOCAL_SOURCES_DIR` | `${CONTEXT_KIT_DATA_DIR}/local-sources` | Machine-local llms.txt tree mounted read-only into docs-mcp |
| `CONTEXT_KIT_DOCS_LOCAL_SOURCES_PORT` | `8769` | Loopback port inside docs-mcp for serving local source files |
## TTL Guidance ## TTL Guidance
@@ -66,3 +75,8 @@ CONTEXT_KIT_DOCS_SOURCES="config/sources.default.txt config/sources.js.txt"
``` ```
Each source file is plain text. Blank lines and `#` comments are ignored. Each source file is plain text. Blank lines and `#` comments are ignored.
Entries may be absolute source-profile paths for private machine-local config.
For local llms.txt files, place content under
`CONTEXT_KIT_DOCS_LOCAL_SOURCES_DIR` and reference it as
`http://127.0.0.1:8769/path/inside/local-sources.txt`; that loopback URL is
inside the docs-mcp container, not exposed on the host.

View File

@@ -33,6 +33,37 @@ Build default images:
bin/context-kit build bin/context-kit build
``` ```
## Fetch URL Says Max Download Bytes Is Too Big
If `fetch_url` fails with an MCP validation error such as
`Number must be less than or equal to 26214400` before any network request is
made, rebuild the web-search MCP image:
```sh
bin/context-kit build
```
Context Kit patches the upstream `mcp-web-search` schema so the accepted
`max_download_bytes` value matches `CONTEXT_KIT_WEB_SEARCH_MAX_BYTES`, which
defaults to `52428800`.
## Search Fallback and Chromium
`search_web` defaults to SearXNG. If SearXNG fails or returns no results, the
upstream fallback order is DuckDuckGo, then Bing. Bing uses Chromium through
Puppeteer, so `bin/context-kit doctor` checks that the configured Chromium path
exists inside the web-search image.
Context Kit carries a source-controlled Bing provider override in
`docker/web-search/overrides/bing.js` because the upstream 1.3.0 provider can
race result rendering and return no items even when Chromium sees Bing result
cards. The override waits for result cards and decodes current Bing redirect
URLs before handing results back to the upstream fallback registry.
`fetch_url` is different: in upstream `mcp-web-search` 1.3.0, `engine=browser` is
accepted but reserved for future support. It does not currently invoke Chromium;
URL fetching uses the HTTP extractor path.
## Docs Indexing Is Slow ## Docs Indexing Is Slow
The first run downloads an embedding model and embeds every configured docs The first run downloads an embedding model and embeds every configured docs

49
scripts/release-check Executable file
View File

@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Release check: runs static checks, install-snippet validation, an image
# build, the doctor command, and the web-search smoke test from the repo root.
# Abort on any failing command, unset variable, or failing pipeline stage.
set -euo pipefail
# ROOT is the parent of the directory that contains this script.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "${ROOT}"
# Scratch directory for generated install snippets.
tmp_dir="$(mktemp -d)"
# Remove the scratch directory; registered below to run on every script exit.
cleanup() {
rm -rf "${tmp_dir}"
}
trap cleanup EXIT
# Syntax-check every file given as an argument with `node --check`, in order.
# Does nothing when called without arguments.
check_node() {
    local script
    # `for NAME; do` iterates over the positional parameters ("$@").
    for script; do
        node --check "${script}"
    done
}
# Fail on whitespace errors or leftover conflict markers in the working-tree diff.
git diff --check
# Fail if any of these files is missing from the git index.
git ls-files --cached --error-unmatch \
docker/web-search/patch-mcp-web-search.mjs \
docker/web-search/overrides/bing.js \
docker/docs/constraints.txt \
scripts/smoke-web-search.mjs \
scripts/release-check >/dev/null
# Parse-only checks (-n / --check): nothing below is executed.
bash -n bin/context-kit
bash -n scripts/release-check
sh -n docker/docs/entrypoint.sh
check_node docker/web-search/patch-mcp-web-search.mjs docker/web-search/overrides/bing.js scripts/smoke-web-search.mjs
# The checked-in MCP snippets must parse as JSON.
node -e 'const fs=require("node:fs"); JSON.parse(fs.readFileSync("snippets/opencode.json", "utf8")); JSON.parse(fs.readFileSync("snippets/claude.mcp.json", "utf8"));'
# Generate install snippets for both clients, with and without --absolute,
# then require each generated file to parse as JSON.
bin/context-kit install opencode > "${tmp_dir}/opencode.json"
bin/context-kit install opencode --absolute > "${tmp_dir}/opencode-absolute.json"
bin/context-kit install claude > "${tmp_dir}/claude.json"
bin/context-kit install claude --absolute > "${tmp_dir}/claude-absolute.json"
node -e 'const fs=require("node:fs"); for (const file of process.argv.slice(1)) JSON.parse(fs.readFileSync(file, "utf8"));' \
"${tmp_dir}/opencode.json" \
"${tmp_dir}/opencode-absolute.json" \
"${tmp_dir}/claude.json" \
"${tmp_dir}/claude-absolute.json"
bin/context-kit redaction-check
# Render compose.yml to validate it; the rendered output is discarded.
docker compose -p context-kit -f compose.yml config >/dev/null
# Build, run the doctor command, then drive the web-search MCP server through
# the smoke script, which spawns `bin/context-kit web-search` as its child.
bin/context-kit build
bin/context-kit doctor
node scripts/smoke-web-search.mjs bin/context-kit web-search
# Only reached when every step above succeeded (set -e).
printf 'pass release-check\n'

View File

@@ -0,0 +1,152 @@
import { spawn } from "node:child_process";
// Usage: node scripts/smoke-web-search.mjs <command> [args...]
// <command> is the MCP server launcher to spawn; [args...] are passed through.
const command = process.argv[2];
const args = process.argv.slice(3);
if (!command) {
  throw new Error("usage: node scripts/smoke-web-search.mjs <command> [args...]");
}

// Start the server with its working directory set to the parent of this
// script's directory. The URL object is handed to spawn() directly:
// `URL.pathname` keeps percent-escapes (a space becomes "%20"), so using it as
// a filesystem path breaks when the checkout path contains spaces or
// non-ASCII characters.
const child = spawn(command, args, {
  cwd: new URL("..", import.meta.url),
  env: process.env,
  stdio: ["pipe", "pipe", "pipe"]
});

// JSON-RPC bookkeeping shared by the helpers below.
let nextId = 1; // id assigned to the next request
const pending = new Map(); // request id -> { resolve, reject }
let stdoutBuffer = ""; // unconsumed stdout text (possibly a partial line)
let stderrBuffer = ""; // everything the child wrote to stderr, for diagnostics
/**
 * Stop the spawned server and wait until it has exited.
 *
 * Closes the child's stdin, sends SIGTERM, and escalates to SIGKILL if the
 * process is still alive after 3 seconds.
 *
 * @returns {Promise<void>} Resolves once the child has exited. Resolves
 *   immediately when the child is already gone.
 */
function stopChild() {
  // A finished process never emits "exit" again, so waiting for that event
  // would leave the returned promise pending forever and the caller would
  // never get to print its diagnostics.
  if (child.exitCode !== null || child.signalCode !== null) {
    return Promise.resolve();
  }
  return new Promise(resolve => {
    const killTimer = setTimeout(() => child.kill("SIGKILL"), 3000);
    // Listen before signalling so the exit notification cannot be missed.
    child.once("exit", () => {
      clearTimeout(killTimer);
      resolve();
    });
    child.stdin.end();
    child.kill("SIGTERM");
  });
}
/**
 * Watchdog handler: stop the child, print the last 2000 characters of its
 * stderr, and exit with status 1.
 */
async function onSmokeTimeout() {
  await stopChild();
  console.error(`MCP smoke timed out. stderr: ${stderrBuffer.slice(-2000)}`);
  process.exit(1);
}

// Global deadline for the whole smoke run: 120 seconds.
const timeout = setTimeout(onSmokeTimeout, 120000);
// Decode at the stream level so a multi-byte UTF-8 character that straddles
// two chunks is reassembled; converting each chunk separately would turn the
// split halves into replacement characters.
child.stderr.setEncoding("utf8");
child.stdout.setEncoding("utf8");

// Keep everything the child writes to stderr for failure diagnostics.
child.stderr.on("data", chunk => {
  stderrBuffer += chunk;
});

// Read newline-delimited JSON-RPC messages from stdout and settle the pending
// request whose id matches. Anything else on stdout is ignored.
child.stdout.on("data", chunk => {
  stdoutBuffer += chunk;
  let newline;
  while ((newline = stdoutBuffer.indexOf("\n")) >= 0) {
    const line = stdoutBuffer.slice(0, newline).trim();
    stdoutBuffer = stdoutBuffer.slice(newline + 1);
    if (!line) continue;
    let message;
    try {
      message = JSON.parse(line);
    } catch {
      // Not JSON: skip the line.
      continue;
    }
    // JSON.parse can legitimately yield null or a primitive; reading `.id`
    // from null would throw inside this event handler.
    if (message === null || typeof message !== "object") continue;
    const waiter = pending.get(message.id);
    // No matching request: a notification or an unknown/duplicate id.
    if (!waiter) continue;
    pending.delete(message.id);
    if (message.error) waiter.reject(new Error(JSON.stringify(message.error)));
    else waiter.resolve(message.result);
  }
});
/**
 * Send a JSON-RPC 2.0 request to the child and wait for its response.
 *
 * @param {string} method - JSON-RPC method name.
 * @param {object} [params={}] - Method parameters.
 * @returns {Promise<*>} Settled by the stdout handler: resolves with the
 *   response `result`, or rejects when the response carries an `error`.
 */
function request(method, params = {}) {
  const id = nextId;
  nextId += 1;
  const payload = JSON.stringify({ jsonrpc: "2.0", id, method, params });
  child.stdin.write(`${payload}\n`);
  return new Promise((resolve, reject) => {
    pending.set(id, { resolve, reject });
  });
}
/**
 * Send a JSON-RPC 2.0 notification to the child. No id is attached and no
 * response is awaited.
 *
 * @param {string} method - JSON-RPC method name.
 * @param {object} [params={}] - Method parameters.
 */
function notify(method, params = {}) {
  const notification = { jsonrpc: "2.0", method, params };
  child.stdin.write(`${JSON.stringify(notification)}\n`);
}
/**
 * Join the text parts of a tool result into one string.
 *
 * @param {{content?: Array<{type: string, text?: string}>}} result - Tool
 *   result; a missing `content` is treated as empty.
 * @returns {string} The `text` of every part whose `type` is "text", in
 *   order, separated by newlines. Empty string when there are none.
 */
function textFrom(result) {
  const parts = result.content || [];
  const texts = parts.flatMap(part => (part.type === "text" ? [part.text] : []));
  return texts.join("\n");
}
/**
 * Invoke an MCP tool through a "tools/call" request.
 *
 * @param {string} name - Tool name.
 * @param {object} [args={}] - Tool arguments, sent as `arguments`.
 * @returns {Promise<*>} Whatever `request` settles with.
 */
async function callTool(name, args = {}) {
  const params = { name, arguments: args };
  return request("tools/call", params);
}
// Smoke sequence: MCP handshake, tool discovery, two searches, two fetches,
// and a check that a loopback URL is refused. Any failure stops the child,
// prints the error plus the tail of the child's stderr, and exits with 1.
try {
  const searchQuery = "Model Context Protocol";
  const maxDownloadBytes = 52428800;
  const publicUrl = "https://example.com/";

  // Call a tool and require `needle` in its text output.
  const expectText = async (label, toolName, toolArgs, needle) => {
    const text = textFrom(await callTool(toolName, toolArgs));
    if (!text.includes(needle)) {
      throw new Error(`${label} returned unexpected text: ${text.slice(0, 500)}`);
    }
  };

  await request("initialize", {
    protocolVersion: "2024-11-05",
    capabilities: {},
    clientInfo: { name: "context-kit-smoke", version: "0.0.0" }
  });
  notify("notifications/initialized");

  // Both tools must be advertised before any of them is exercised.
  const listed = await request("tools/list");
  const toolNames = new Set((listed.tools || []).map(tool => tool.name));
  const missing = ["search_web", "fetch_url"].find(name => !toolNames.has(name));
  if (missing !== undefined) throw new Error(`missing tool: ${missing}`);

  await expectText(
    "SearXNG smoke",
    "search_web",
    { q: searchQuery, limit: 2, provider: "searxng" },
    "Model"
  );
  await expectText(
    "Bing smoke",
    "search_web",
    { q: searchQuery, limit: 2, provider: "bing" },
    "Model"
  );
  await expectText(
    "fetch smoke",
    "fetch_url",
    { url: publicUrl, format: "markdown", max_download_bytes: maxDownloadBytes },
    "Example Domain"
  );
  await expectText(
    "browser fetch smoke",
    "fetch_url",
    { url: publicUrl, format: "markdown", engine: "browser", max_download_bytes: maxDownloadBytes },
    "Example Domain"
  );

  // A loopback target must come back as an error result with the guard message.
  const localResult = await callTool("fetch_url", {
    url: "http://127.0.0.1:1/",
    max_download_bytes: maxDownloadBytes
  });
  if (!localResult.isError || !textFrom(localResult).includes("Blocked localhost/private URL")) {
    throw new Error("localhost/private URL was not blocked as expected");
  }

  clearTimeout(timeout);
  await stopChild();
  const summary = {
    tools: Array.from(toolNames).sort(),
    searxng: "pass",
    bing: "pass",
    fetch_url: "pass",
    fetch_url_browser_engine_currently_http: "pass",
    localhost_guard: "pass"
  };
  console.log(JSON.stringify(summary, null, 2));
} catch (error) {
  clearTimeout(timeout);
  await stopChild();
  console.error(error.message);
  // Last 4000 characters of the child's stderr, when it wrote any.
  if (stderrBuffer) console.error(stderrBuffer.slice(-4000));
  process.exit(1);
}

View File

@@ -5,12 +5,13 @@
"type": "local", "type": "local",
"command": ["context-kit", "web-search"], "command": ["context-kit", "web-search"],
"enabled": true, "enabled": true,
"timeout": 60000 "timeout": 150000
}, },
"context-docs": { "context-docs": {
"type": "remote", "type": "remote",
"url": "http://127.0.0.1:8776/mcp", "url": "http://127.0.0.1:8776/mcp",
"enabled": true "enabled": true,
"timeout": 150000
}, },
"context-repomix": { "context-repomix": {
"type": "local", "type": "local",