Harden web search and docs defaults
This commit is contained in:
@@ -1,2 +1,5 @@
|
||||
*
|
||||
!Dockerfile
|
||||
!patch-mcp-web-search.mjs
|
||||
!overrides/
|
||||
!overrides/bing.js
|
||||
|
||||
@@ -1,7 +1,14 @@
|
||||
FROM node:22-bookworm-slim
|
||||
FROM node:22-bookworm-slim@sha256:813a7480f28fdadac1f7f5c824bcdad435b5bc1322a5968bbbdef8d058f9dff4
|
||||
|
||||
ARG MCP_WEB_SEARCH_VERSION=1.3.0
|
||||
ARG MCP_WEB_SEARCH_MAX_BYTES=52428800
|
||||
|
||||
COPY patch-mcp-web-search.mjs /tmp/patch-mcp-web-search.mjs
|
||||
COPY overrides/bing.js /tmp/context-kit-bing-provider.js
|
||||
|
||||
# Chromium intentionally tracks Debian security updates inside the pinned base
|
||||
# image family; Bing's browser path is more likely to break with stale Chromium
|
||||
# than with patched OS packages.
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
ca-certificates \
|
||||
@@ -10,11 +17,15 @@ RUN apt-get update \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN npm install -g "@zhafron/mcp-web-search@${MCP_WEB_SEARCH_VERSION}" \
|
||||
&& cp /tmp/context-kit-bing-provider.js /usr/local/lib/node_modules/@zhafron/mcp-web-search/dist/src/providers/bing.js \
|
||||
&& node /tmp/patch-mcp-web-search.mjs \
|
||||
&& rm /tmp/patch-mcp-web-search.mjs /tmp/context-kit-bing-provider.js \
|
||||
&& npm cache clean --force
|
||||
|
||||
ENV CHROME_PATH=/usr/bin/chromium \
|
||||
DEFAULT_SEARCH_PROVIDER=searxng \
|
||||
HTTP_TIMEOUT=15000 \
|
||||
MAX_BYTES=${MCP_WEB_SEARCH_MAX_BYTES} \
|
||||
MAX_RESULTS=10 \
|
||||
SEARXNG_URL=http://searxng:8080
|
||||
|
||||
|
||||
114
docker/web-search/overrides/bing.js
Normal file
114
docker/web-search/overrides/bing.js
Normal file
@@ -0,0 +1,114 @@
|
||||
import { PUPPETEER_TIMEOUT } from "../constants.js";
|
||||
import { browserPool } from "../utils/browser-pool.js";
|
||||
import { getAcceptLanguageHeader, getMarketFromLang } from "../utils/user-agent.js";
|
||||
import { searchCache, createCacheKey } from "../utils/cache.js";
|
||||
|
||||
// Context Kit override for @zhafron/mcp-web-search 1.3.0.
|
||||
// The upstream provider can read Bing before result cards render and return an
|
||||
// empty fallback. Keep this as a direct provider replacement until upstream
|
||||
// waits for cards and decodes current /ck/a redirects reliably.
|
||||
// User-Agent string presented to Bing by the headless browser. It can be
// overridden with the BROWSER_SEARCH_USER_AGENT environment variable; the value
// is trimmed so a padded or whitespace-only setting (easy to produce from a
// .env file) does not end up as a blank or malformed header. `||` rather than
// `??` is deliberate: an empty override must fall back to the built-in default.
const DEFAULT_BROWSER_SEARCH_USER_AGENT = process.env.BROWSER_SEARCH_USER_AGENT?.trim() ||
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36";
|
||||
|
||||
/**
 * Decode a base64url-encoded string (URL-safe alphabet, padding optional)
 * into UTF-8 text.
 *
 * Node's built-in "base64url" decoder already understands "-" / "_" and
 * missing "=" padding, so no manual alphabet swap or re-padding is needed.
 *
 * @param {string} value - base64url (or regular base64) text to decode.
 * @returns {string} The decoded bytes interpreted as UTF-8.
 */
function decodeBase64Url(value) {
    return Buffer.from(value, "base64url").toString("utf-8");
}
|
||||
|
||||
/**
 * Search provider that loads a Bing results page in a pooled browser and
 * scrapes the organic result cards (`li.b_algo`).
 */
export class BingProvider {
    name = "bing";

    /**
     * Resolve a Bing click-tracking link to the URL it points at.
     *
     * Links of the form `https://www.bing.com/ck/a?...&u=<token>` carry the
     * destination base64url-encoded in `u`, optionally behind a two-character
     * letter+digit prefix. Both the raw token and the prefix-stripped token
     * are tried; the first one that decodes to an http(s) URL wins.
     *
     * @param {string} href - Absolute or Bing-relative link.
     * @returns {string} The decoded destination, the normalized absolute URL
     *   when nothing could be decoded, or `href` unchanged if it cannot be
     *   parsed as a URL at all.
     */
    decodeBingRedirect(href) {
        try {
            const url = new URL(href, "https://www.bing.com/");
            if (url.hostname === "www.bing.com" && url.pathname === "/ck/a") {
                const encoded = url.searchParams.get("u");
                if (encoded) {
                    const candidates = [encoded];
                    // Letter+digit prefix (e.g. "a1") precedes the payload.
                    if (/^[a-z][0-9]/i.test(encoded)) candidates.push(encoded.slice(2));
                    for (const candidate of candidates) {
                        try {
                            const decoded = decodeBase64Url(candidate);
                            if (/^https?:\/\//i.test(decoded)) return decoded;
                        }
                        catch {
                            // Best effort: an undecodable candidate just
                            // means we try the next one.
                        }
                    }
                }
            }
            return url.toString();
        }
        catch {
            return href;
        }
    }

    /**
     * Run a Bing web search and return the scraped results.
     *
     * @param {string} q - Query text, sent as the `q` parameter.
     * @param {number} limit - Maximum number of result cards to collect.
     * @param {string} lang - Language hint; passed to `getMarketFromLang`
     *   (for the `mkt` parameter) and `getAcceptLanguageHeader`.
     * @returns {Promise<Array<{title: string, url: string, snippet: (string|undefined), source: string}>>}
     *   Results with Bing redirect links decoded; entries whose URL is not
     *   parseable are dropped.
     * @throws {Error} `Bing HTTP <status>` when the navigation response has a
     *   status of 400 or above; errors from the browser are propagated.
     */
    async search(q, limit, lang) {
        const cacheKey = createCacheKey("bing", q, limit, lang);
        const cached = searchCache.get(cacheKey);
        if (cached)
            return cached;
        const market = getMarketFromLang(lang);
        const results = await browserPool.withBrowser(async (browser) => {
            const page = await browser.newPage();
            try {
                await page.setViewport({ width: 1365, height: 768 });
                await page.setUserAgent(DEFAULT_BROWSER_SEARCH_USER_AGENT);
                await page.setExtraHTTPHeaders(getAcceptLanguageHeader(lang));
                const url = new URL("https://www.bing.com/search");
                url.searchParams.set("q", q);
                url.searchParams.set("mkt", market);
                const response = await page.goto(url.toString(), {
                    waitUntil: "domcontentloaded",
                    timeout: PUPPETEER_TIMEOUT
                });
                if (response && response.status() >= 400) {
                    throw new Error(`Bing HTTP ${response.status()}`);
                }
                // Give result cards time to render. A timeout here is not
                // fatal: we fall through and scrape whatever is present.
                await page.waitForSelector("li.b_algo h2 a[href], li.b_algo a[href]", { timeout: 10000 }).catch(() => undefined);
                // Runs inside the page: must stay self-contained.
                const items = await page.evaluate(maxResults => {
                    const parsed = [];
                    for (const card of Array.from(document.querySelectorAll("li.b_algo"))) {
                        const anchor = card.querySelector("h2 a[href]") || card.querySelector("a[href]");
                        const title = anchor?.textContent?.trim() || "";
                        const href = anchor?.getAttribute("href") || "";
                        if (!title || !href)
                            continue;
                        const snippetElement = card.querySelector("div.b_caption p, div.b_snippet, p");
                        const snippet = snippetElement?.textContent?.trim() || undefined;
                        parsed.push({ title, url: href, snippet });
                        if (parsed.length >= maxResults)
                            break;
                    }
                    return parsed;
                }, limit);
                return items.flatMap(result => {
                    try {
                        const absolute = new URL(result.url, "https://www.bing.com/").toString();
                        const decoded = this.decodeBingRedirect(absolute);
                        // Validation only: throws (and drops the entry) if
                        // the decoded value is not a parseable URL.
                        new URL(decoded);
                        return [{ ...result, url: decoded, source: "bing" }];
                    }
                    catch {
                        return [];
                    }
                });
            }
            finally {
                await page.close();
            }
        });
        // Do not cache an empty list: because the selector wait above is
        // allowed to time out, "no results" may just mean the cards had not
        // rendered yet, and caching it would keep serving that empty answer.
        if (results?.length > 0) {
            searchCache.set(cacheKey, results);
        }
        return results;
    }

    /**
     * Report whether a browser can be obtained from the pool.
     *
     * @returns {Promise<boolean>} `true` if `browserPool.getBrowser()`
     *   resolves, `false` if it rejects.
     */
    async isAvailable() {
        try {
            await browserPool.getBrowser();
            return true;
        }
        catch {
            return false;
        }
    }
}
|
||||
28
docker/web-search/patch-mcp-web-search.mjs
Normal file
28
docker/web-search/patch-mcp-web-search.mjs
Normal file
@@ -0,0 +1,28 @@
|
||||
import fs from "node:fs";
|
||||
|
||||
// Context Kit patch for @zhafron/mcp-web-search 1.3.0.
//
// The compiled upstream server pins the fetch_url `max_download_bytes` schema
// ceiling at 26214400 (25 MiB) even though the runtime extractor already uses
// MAX_BYTES. This script rewrites exactly two source snippets so the schema
// reads MAX_BYTES as well. It is intentionally narrow: if either snippet is
// missing (upstream changed the compiled output), it throws and the build fails.
const SERVER_FILE = "/usr/local/lib/node_modules/@zhafron/mcp-web-search/dist/src/server.js";

// Each edit swaps the first occurrence of `find` for `swap`, applied in order.
const EDITS = [
    {
        // Bring MAX_BYTES into scope alongside MAX_RESULTS.
        find: 'import { MAX_RESULTS } from "./constants.js";',
        swap: 'import { MAX_BYTES, MAX_RESULTS } from "./constants.js";',
    },
    {
        // Replace the hard-coded ceiling with the configurable one.
        find: "max_download_bytes: z.number().int().min(1).max(26214400).optional()",
        swap: "max_download_bytes: z.number().int().min(1).max(MAX_BYTES).optional()",
    },
];

const patched = EDITS.reduce((text, { find, swap }) => {
    if (!text.includes(find)) {
        throw new Error(`mcp-web-search patch target not found: ${find}`);
    }
    return text.replace(find, swap);
}, fs.readFileSync(SERVER_FILE, "utf8"));

fs.writeFileSync(SERVER_FILE, patched);
|
||||
@@ -15,8 +15,8 @@ search:
|
||||
- json
|
||||
|
||||
server:
|
||||
# Local placeholder. The Docker service also sets SEARXNG_SECRET from .env;
|
||||
# keep SearXNG bound to 127.0.0.1 unless you review this config separately.
|
||||
# Local placeholder. Keep SearXNG bound to 127.0.0.1 unless you review this
|
||||
# config and replace the secret_key for a deliberate non-local deployment.
|
||||
secret_key: "local-only-change-if-exposed"
|
||||
limiter: false
|
||||
image_proxy: true
|
||||
|
||||
Reference in New Issue
Block a user