Harden web search and docs defaults

This commit is contained in:
2026-06-24 23:57:44 -07:00
parent 8fcd94d2c5
commit 8237f1331c
19 changed files with 691 additions and 35 deletions

View File

@@ -1,7 +1,17 @@
#!/usr/bin/env bash
# Strict mode: exit on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail
# Parent of the directory that holds this script, computed from the invoked
# path as-is.
# NOTE(review): ROOT is assigned again after the loop below, so this value is
# overwritten before any use visible here — confirm whether this line is still
# needed.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# Resolve the script's real location by following the symlink chain one hop at
# a time until SCRIPT_PATH is no longer a symbolic link.
SCRIPT_PATH="${BASH_SOURCE[0]}"
while [[ -L "${SCRIPT_PATH}" ]]; do
# Physical (cd -P) directory that contains the current link.
SCRIPT_DIR="$(cd -P "$(dirname "${SCRIPT_PATH}")" && pwd)"
SCRIPT_TARGET="$(readlink "${SCRIPT_PATH}")"
if [[ "${SCRIPT_TARGET}" = /* ]]; then
# Absolute link target: use it directly.
SCRIPT_PATH="${SCRIPT_TARGET}"
else
# Relative link target: interpret it relative to the link's own directory.
SCRIPT_PATH="${SCRIPT_DIR}/${SCRIPT_TARGET}"
fi
done
# Parent of the directory holding the resolved script, as a physical path.
ROOT="$(cd -P "$(dirname "${SCRIPT_PATH}")/.." && pwd)"
# Environment file expected directly under ROOT.
ENV_FILE="${ROOT}/.env"
load_env_file() {
@@ -39,10 +49,19 @@ NETWORK="${CONTEXT_KIT_DOCKER_NETWORK:-${PROJECT}_default}"
# Ports and the docs MCP endpoint. Each value comes from its CONTEXT_KIT_*
# variable when that is set and non-empty, otherwise from the default shown.
SEARXNG_PORT="${CONTEXT_KIT_SEARXNG_PORT:-8099}"
DOCS_PORT="${CONTEXT_KIT_DOCS_PORT:-8776}"
# The default URL targets 127.0.0.1 on DOCS_PORT, path /mcp.
DOCS_HTTP_URL="${CONTEXT_KIT_DOCS_HTTP_URL:-http://127.0.0.1:${DOCS_PORT}/mcp}"
# Web-search settings. Default is 52428800 bytes (50 * 1024 * 1024).
WEB_SEARCH_MAX_BYTES="${CONTEXT_KIT_WEB_SEARCH_MAX_BYTES:-52428800}"
# The settings below resolve in order: the CONTEXT_KIT_WEB_SEARCH_* variable,
# then the unprefixed variable named in the inner expansion, then the literal
# default (which is empty for the last two).
WEB_SEARCH_PROVIDER="${CONTEXT_KIT_WEB_SEARCH_PROVIDER:-${DEFAULT_SEARCH_PROVIDER:-searxng}}"
WEB_SEARCH_HTTP_TIMEOUT="${CONTEXT_KIT_WEB_SEARCH_HTTP_TIMEOUT:-${HTTP_TIMEOUT:-15000}}"
WEB_SEARCH_MAX_RESULTS="${CONTEXT_KIT_WEB_SEARCH_MAX_RESULTS:-${MAX_RESULTS:-10}}"
WEB_SEARCH_CHROME_PATH="${CONTEXT_KIT_WEB_SEARCH_CHROME_PATH:-${CHROME_PATH:-/usr/bin/chromium}}"
WEB_SEARCH_BROWSER_USER_AGENT="${CONTEXT_KIT_WEB_SEARCH_BROWSER_USER_AGENT:-${BROWSER_SEARCH_USER_AGENT:-}}"
WEB_SEARCH_MCP_COMPAT_MODE="${CONTEXT_KIT_WEB_SEARCH_MCP_COMPAT_MODE:-${MCP_COMPAT_MODE:-}}"
# Fixed container name for the docs MCP service (no override variable here).
DOCS_CONTAINER_NAME="context-kit-docs-mcp"
# Paths below DATA_DIR; DATA_DIR itself is assigned outside this excerpt.
DOCS_SOURCES_FILE="${DATA_DIR}/docs-sources.txt"
DOCS_DATA_DIR="${DATA_DIR}/docs"
MODELS_DATA_DIR="${DATA_DIR}/models"
# Local docs sources: directory (defaults to a DATA_DIR subdirectory) and port.
DOCS_LOCAL_SOURCES_DIR="${CONTEXT_KIT_DOCS_LOCAL_SOURCES_DIR:-${DATA_DIR}/local-sources}"
DOCS_LOCAL_SOURCES_PORT="${CONTEXT_KIT_DOCS_LOCAL_SOURCES_PORT:-8769}"
# Image references for the web-search and docs MCP containers.
WEB_SEARCH_IMAGE="${CONTEXT_KIT_WEB_SEARCH_IMAGE:-context-kit/web-search-mcp:latest}"
DOCS_IMAGE="${CONTEXT_KIT_DOCS_IMAGE:-context-kit/docs-mcp:latest}"
@@ -86,6 +105,8 @@ compose() {
CONTEXT_KIT_DOCS_MAX_GET_BYTES="${CONTEXT_KIT_DOCS_MAX_GET_BYTES:-75000}" \
CONTEXT_KIT_DOCS_EMBED_MODEL="${CONTEXT_KIT_DOCS_EMBED_MODEL:-BAAI/bge-small-en-v1.5}" \
CONTEXT_KIT_DOCS_PREINDEX="${CONTEXT_KIT_DOCS_PREINDEX:-0}" \
CONTEXT_KIT_DOCS_LOCAL_SOURCES_DIR="${DOCS_LOCAL_SOURCES_DIR}" \
CONTEXT_KIT_DOCS_LOCAL_SOURCES_PORT="${DOCS_LOCAL_SOURCES_PORT}" \
BUILDX_BUILDER="${CONTEXT_KIT_BUILDX_BUILDER:-${BUILDX_BUILDER:-default}}" \
docker compose -p "${PROJECT}" -f "${COMPOSE_FILE}" "$@"
}
@@ -112,11 +133,12 @@ prepare_data_dirs() {
ensure_writable_dir "${DATA_DIR}"
ensure_writable_dir "${DOCS_DATA_DIR}"
ensure_writable_dir "${MODELS_DATA_DIR}"
ensure_writable_dir "${DOCS_LOCAL_SOURCES_DIR}"
}
check_data_dirs() {
local ok=0 dir
for dir in "${DATA_DIR}" "${DOCS_DATA_DIR}" "${MODELS_DATA_DIR}"; do
for dir in "${DATA_DIR}" "${DOCS_DATA_DIR}" "${MODELS_DATA_DIR}" "${DOCS_LOCAL_SOURCES_DIR}"; do
if [[ ! -d "${dir}" ]]; then
printf 'warn data directory missing: %s (run context-kit start)\n' "${dir}"
elif [[ -w "${dir}" && -x "${dir}" ]]; then
@@ -129,6 +151,41 @@ check_data_dirs() {
return "${ok}"
}
#######################################
# Probe the web-search image for the max-bytes schema patch.
# Starts a throwaway container (--rm) from WEB_SEARCH_IMAGE with node as the
# entrypoint and runs an inline script that reads the package's server.js and
# exits 1 unless it contains the expected max_download_bytes schema fragment.
# The script also exits 1 if MAX_BYTES and EXPECTED_MAX_BYTES differ after
# numeric conversion.
# NOTE(review): both of those variables are passed from WEB_SEARCH_MAX_BYTES,
# so that comparison cannot fail as written — confirm intent.
# Globals:   WEB_SEARCH_MAX_BYTES (read), WEB_SEARCH_IMAGE (read)
# Outputs:   none; stdout and stderr are discarded
# Returns:   exit status of the docker command
#######################################
check_web_search_schema_patch() {
  local probe_js
  probe_js='
const fs = require("node:fs");
const expected = Number(process.env.EXPECTED_MAX_BYTES) || 0;
const actual = Number(process.env.MAX_BYTES) || 0;
const serverPath = "/usr/local/lib/node_modules/@zhafron/mcp-web-search/dist/src/server.js";
const source = fs.readFileSync(serverPath, "utf8");
if (actual !== expected) process.exit(1);
if (!source.includes("max_download_bytes: z.number().int().min(1).max(MAX_BYTES).optional()")) process.exit(1);
'
  local -a docker_args=(
    run --rm
    --entrypoint node
    -e "MAX_BYTES=${WEB_SEARCH_MAX_BYTES}"
    -e "EXPECTED_MAX_BYTES=${WEB_SEARCH_MAX_BYTES}"
    "${WEB_SEARCH_IMAGE}"
    -e "${probe_js}"
  )
  docker "${docker_args[@]}" &>/dev/null
}
#######################################
# Probe the web-search image for the Bing provider override.
# Starts a throwaway container (--rm) from WEB_SEARCH_IMAGE with node as the
# entrypoint and runs an inline script that reads the package's
# providers/bing.js. The script exits 1 unless the file contains all three
# marker strings: the override banner, "waitForSelector" and
# "decodeBingRedirect".
# Globals:   WEB_SEARCH_IMAGE (read)
# Outputs:   none; stdout and stderr are discarded
# Returns:   exit status of the docker command
#######################################
check_web_search_bing_override() {
  local probe_js
  probe_js='
const fs = require("node:fs");
const bingPath = "/usr/local/lib/node_modules/@zhafron/mcp-web-search/dist/src/providers/bing.js";
const source = fs.readFileSync(bingPath, "utf8");
if (!source.includes("Context Kit override for @zhafron/mcp-web-search 1.3.0")) process.exit(1);
if (!source.includes("waitForSelector")) process.exit(1);
if (!source.includes("decodeBingRedirect")) process.exit(1);
'
  local -a docker_args=(
    run --rm
    --entrypoint node
    "${WEB_SEARCH_IMAGE}"
    -e "${probe_js}"
  )
  docker "${docker_args[@]}" &>/dev/null
}
#######################################
# Check that the configured browser path is executable inside the image.
# Starts a throwaway container (--rm) from WEB_SEARCH_IMAGE with /usr/bin/test
# as the entrypoint and the arguments: -x WEB_SEARCH_CHROME_PATH.
# Globals:   WEB_SEARCH_IMAGE (read), WEB_SEARCH_CHROME_PATH (read)
# Outputs:   none; stdout and stderr are discarded
# Returns:   exit status of the docker command
#######################################
check_web_search_chrome() {
  local -a probe=(
    docker run --rm
    --entrypoint /usr/bin/test
    "${WEB_SEARCH_IMAGE}"
    -x "${WEB_SEARCH_CHROME_PATH}"
  )
  "${probe[@]}" &>/dev/null
}
#######################################
# Emit a warning line on stderr.
# Arguments: all arguments, joined into one string by "$*"
# Outputs:   "warn: <message>" followed by a newline, on stderr
#######################################
warn() {
  local message="$*"
  printf 'warn: %s\n' "${message}" 1>&2
}
@@ -257,9 +314,12 @@ cmd_status() {
printf '\nImages\n'
docker image ls --format '{{.Repository}}:{{.Tag}}\t{{.Size}}' \
| grep -E '^(context-kit/|ghcr.io/yamadashy/repomix:)' || true
printf '\nLabeled containers\n'
docker ps -a --filter label=dev.context-kit=true --format 'table {{.Names}}\t{{.Status}}\t{{.Image}}'
printf '\nDocs MCP endpoint\n- %s (container: %s)\n' "${DOCS_HTTP_URL}" "${DOCS_CONTAINER_NAME}"
printf '\nDocs sources\n'
resolved_sources | sed 's/^/- /'
printf '\nLocal docs source directory\n- %s (served inside docs-mcp at http://127.0.0.1:%s/)\n' "${DOCS_LOCAL_SOURCES_DIR}" "${DOCS_LOCAL_SOURCES_PORT}"
printf '\nData directory\n- %s\n' "${DATA_DIR}"
}
@@ -303,6 +363,27 @@ cmd_doctor() {
fi
done
if docker image inspect "${WEB_SEARCH_IMAGE}" >/dev/null 2>&1; then
if check_web_search_schema_patch; then
printf 'pass web-search fetch_url max-bytes schema patch: %s\n' "${WEB_SEARCH_MAX_BYTES}"
else
printf 'fail web-search max-bytes schema patch missing; run: context-kit build\n'
ok=1
fi
if check_web_search_bing_override; then
printf 'pass web-search Bing provider override installed\n'
else
printf 'fail web-search Bing provider override missing; run: context-kit build\n'
ok=1
fi
if check_web_search_chrome; then
printf 'pass web-search Chromium path: %s\n' "${WEB_SEARCH_CHROME_PATH}"
else
printf 'fail web-search Chromium path unavailable: %s\n' "${WEB_SEARCH_CHROME_PATH}"
ok=1
fi
fi
if command -v curl >/dev/null 2>&1 && curl -fsS "http://127.0.0.1:${SEARXNG_PORT}/healthz" >/dev/null 2>&1; then
printf 'pass SearXNG responds on 127.0.0.1:%s\n' "${SEARXNG_PORT}"
else
@@ -331,11 +412,14 @@ cmd_web_search() {
exec docker run --rm -i \
--label dev.context-kit=true \
--network "${NETWORK}" \
-e DEFAULT_SEARCH_PROVIDER="${DEFAULT_SEARCH_PROVIDER:-searxng}" \
-e DEFAULT_SEARCH_PROVIDER="${WEB_SEARCH_PROVIDER}" \
-e SEARXNG_URL="${SEARXNG_URL:-http://searxng:8080}" \
-e CHROME_PATH="${CHROME_PATH:-/usr/bin/chromium}" \
-e HTTP_TIMEOUT="${HTTP_TIMEOUT:-15000}" \
-e MAX_RESULTS="${MAX_RESULTS:-10}" \
-e CHROME_PATH="${WEB_SEARCH_CHROME_PATH}" \
-e HTTP_TIMEOUT="${WEB_SEARCH_HTTP_TIMEOUT}" \
-e MAX_BYTES="${WEB_SEARCH_MAX_BYTES}" \
-e MAX_RESULTS="${WEB_SEARCH_MAX_RESULTS}" \
-e BROWSER_SEARCH_USER_AGENT="${WEB_SEARCH_BROWSER_USER_AGENT}" \
-e MCP_COMPAT_MODE="${WEB_SEARCH_MCP_COMPAT_MODE}" \
"${WEB_SEARCH_IMAGE}"
}
@@ -397,12 +481,13 @@ print_opencode() {
"type": "local",
"command": ["${bin}", "web-search"],
"enabled": true,
"timeout": 60000
"timeout": 150000
},
"context-docs": {
"type": "remote",
"url": "${url}",
"enabled": true
"enabled": true,
"timeout": 150000
},
"context-repomix": {
"type": "local",
@@ -451,7 +536,7 @@ cmd_install() {
cmd_redaction_check() {
local bad=0
local local_path_terms='/(home|Users)/[^/[:space:]]+|[A-Za-z]:\\Users\\[^\\[:space:]]+'
local local_path_terms='/(home|Users)/[^/[:space:]]+|/data/(projects|opencode-mcp)[^[:space:]]*|[A-Za-z]:\\Users\\[^\\[:space:]]+'
local secret_terms='AKIA[0-9A-Z]{16}|BEGIN (RSA |OPENSSH |EC |DSA )?PRIVATE KEY|xox[baprs]-|sk-[A-Za-z0-9_-]{20,}|ghp_[A-Za-z0-9_]{20,}|github_pat_[A-Za-z0-9_]{20,}|glpat-[A-Za-z0-9_-]{20,}|gitea_[A-Za-z0-9_-]{20,}'
# Scan only what would be published: skip .git plus everything .gitignore