Files
2025-09-13 14:08:03 +02:00

162 lines
5.2 KiB
JavaScript

// sniff_puppeteer_save.js
// npm install puppeteer
// node sniff_puppeteer_save.js
const fs = require('fs').promises;
const path = require('path');
const puppeteer = require('puppeteer');
//// CONFIG ////
// URL of the page whose resources should be captured (the intended navigation target).
const targetUrl = 'https://sushiscan.net/la-gardienne-des-concubines-volume-1/'
// Only responses whose URL starts with this prefix are considered.
// Set to '' to disable prefix filtering.
const targetPrefix = 'https://c.sushiscan.net/wp-content/';
// Extensions to allow (set to [] to disable extension filtering).
// Include the leading dot; matching against the URL path is case-insensitive.
const extensions = ['.webp']; // e.g. ['.webp', '.jpg'] or [] for no extension filter
// Where to save files: a `data` directory next to this script.
const DATA_DIR = path.join(__dirname, 'data');
//// Helpers ////
// Report whether the path component of `url` ends with one of `exts`.
//   url  - absolute URL string; query string and fragment are ignored.
//   exts - list of extensions including the leading dot (e.g. ['.webp']).
//          A missing or empty list disables filtering, so everything matches.
// Returns true on a case-insensitive match, false otherwise. A URL that
// cannot be parsed never matches.
function hasExtension(url, exts) {
  const filterDisabled = !exts || exts.length === 0;
  if (filterDisabled) {
    return true;
  }

  try {
    const { pathname } = new URL(url);
    const loweredPath = pathname.toLowerCase();
    const matchesSuffix = (suffix) => loweredPath.endsWith(suffix.toLowerCase());
    return exts.some(matchesSuffix);
  } catch (parseError) {
    // Unparseable URL (or malformed extension list): treat as "no match".
    return false;
  }
}
// Return the final non-empty segment of a URL's path.
//   url - URL string. When it parses as an absolute URL, its pathname is
//         used; otherwise the raw string is used with anything from the
//         first '?' or '#' onwards stripped.
// Trailing slashes are ignored ('/dir/' yields 'dir'). Returns 'index' when
// the path has no non-empty segment at all.
function lastPathSegment(url) {
  // Shared tail extraction for both the parsed and the fallback path.
  const tailOf = (pathLike) => {
    const parts = pathLike.split('/').filter((part) => part !== '');
    return parts.length > 0 ? parts[parts.length - 1] : 'index';
  };

  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch (e) {
    // Not an absolute URL: strip query and fragment by hand.
    const [beforeQuery] = url.split('?');
    const [beforeFragment] = beforeQuery.split('#');
    return tailOf(beforeFragment);
  }
  return tailOf(pathname);
}
// Turn an arbitrary string into a conservative, filesystem-safe file name.
//   name - raw name, typically the last path segment of a URL.
// Returns a string that:
//   - contains only ASCII letters, digits, '.', '-' and '_' (every other
//     character becomes '_');
//   - never starts with a dot (a 'file' prefix is added), so it is not hidden;
//   - is at most 200 characters long, keeping the extension when shortened;
//   - is 'index' when the input is empty.
function sanitizeFilename(name) {
  const maxLen = 200;
  // An "extension" longer than this is assumed to be part of the name and
  // is not preserved when truncating.
  const maxExtLen = 16;

  let safe = name.replace(/[^a-zA-Z0-9.\-_]/g, '_');

  // Prefix before truncating so the length cap also covers the prefix.
  if (safe.startsWith('.')) safe = `file${safe}`;

  if (safe.length > maxLen) {
    // Cut the stem rather than the tail so the extension survives.
    const ext = path.extname(safe);
    const keptExt = ext.length <= maxExtLen ? ext : '';
    safe = safe.slice(0, maxLen - keptExt.length) + keptExt;
  }

  return safe || 'index';
}
// Full paths already handed out by uniqueFilename() during this run. A name
// is recorded the moment it is returned, before the caller has written the
// file, so concurrent callers can never be given the same name.
const reservedPaths = new Set();

// Pick a file name in `dir` that is neither on disk nor already handed out.
//   dir  - directory the file will be written to.
//   base - desired, already sanitized file name (e.g. '01.webp').
// Returns `base` itself when free, otherwise `<stem>-<n><ext>` with the
// smallest free n >= 1. The returned name stays reserved for the lifetime of
// the process even if the caller never writes the file.
// Throws any fs.access error other than ENOENT (e.g. EACCES), since such an
// error does not prove the name is free.
async function uniqueFilename(dir, base) {
  const ext = path.extname(base);
  const stem = path.basename(base, ext);

  for (let attempt = 0; ; attempt += 1) {
    const candidate = attempt === 0 ? base : `${stem}-${attempt}${ext}`;
    const full = path.join(dir, candidate);

    if (reservedPaths.has(full)) continue;

    try {
      await fs.access(full);
      // Exists on disk -> try the next suffix.
    } catch (err) {
      if (err.code !== 'ENOENT') throw err;
      // Another caller may have claimed this name while we were awaiting;
      // the check-and-add below has no await in between, so it is atomic.
      if (!reservedPaths.has(full)) {
        reservedPaths.add(full);
        return candidate;
      }
    }
  }
}
// Create the output directory (DATA_DIR), including any missing parent
// directories. Does nothing if the directory already exists.
async function ensureDataDir() {
  const mkdirOptions = { recursive: true };
  await fs.mkdir(DATA_DIR, mkdirOptions);
}
//// Main ////
// Entry point: launch a visible Chrome, save every matching network response
// of the page at `targetUrl` into DATA_DIR (plus a `.meta.json` sidecar per
// file), then keep the browser open so further responses are captured until
// the user stops the script.
(async () => {
  await ensureDataDir();

  // Suffix to append when the URL's last path segment has no extension.
  // Checked in order; the first content-type substring that matches wins.
  const extensionByContentType = [
    ['image/webp', '.webp'],
    ['image/png', '.png'],
    ['image/jpeg', '.jpg'],
    ['text/html', '.html'],
    ['application/json', '.json'],
  ];

  const browser = await puppeteer.launch({
    headless: false, // headless:true won't have a visible window
    executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe', // path to your Chrome
    defaultViewport: null, // disables the default small viewport
    args: [
      '--start-maximized',
      '--disable-blink-features=AutomationControlled', // hides navigator.webdriver
    ],
    ignoreDefaultArgs: ['--enable-automation'], // remove automation flag
  });

  try {
    const page = await browser.newPage();

    // Register the listener before navigating so no response is missed.
    page.on('response', async (response) => {
      const url = response.url();
      try {
        if (targetPrefix && !url.startsWith(targetPrefix)) return;
        if (!hasExtension(url, extensions)) return;

        let filename = sanitizeFilename(lastPathSegment(url));

        // If the path segment has no extension, derive one from the
        // content type for the common cases; otherwise leave the name as-is.
        const contentType = response.headers()['content-type'] || '';
        if (!path.extname(filename) && contentType) {
          const match = extensionByContentType.find(([type]) => contentType.includes(type));
          if (match) filename += match[1];
        }

        filename = await uniqueFilename(DATA_DIR, filename);
        const outPath = path.join(DATA_DIR, filename);

        // Read the body as a buffer (works for binary and text). Some
        // responses have no readable body, so a failure here only skips
        // this one response.
        let buffer;
        try {
          buffer = await response.buffer();
        } catch (err) {
          console.error('Could not read response body for', url, err.message);
          return;
        }

        await fs.writeFile(outPath, buffer);

        // Sidecar metadata describing where the file came from.
        const meta = {
          url,
          status: response.status(),
          headers: response.headers(),
          savedAt: new Date().toISOString(),
          filename,
        };
        await fs.writeFile(outPath + '.meta.json', JSON.stringify(meta, null, 2));
        console.log(`Saved: ${outPath} (bytes: ${buffer.length})`);
      } catch (err) {
        // One bad response must not take down the whole capture session.
        console.error('Error handling response', url, err);
      }
    });

    // Navigate to the configured page; its requests are what we capture.
    await page.goto(targetUrl, { waitUntil: 'networkidle2', timeout: 300_000 });
    console.log('Listening for responses. Press Ctrl+C to stop.');
  } catch (err) {
    // Setup or navigation failed: do not leave an orphaned browser behind.
    await browser.close().catch((closeErr) => {
      console.error('Could not close browser:', closeErr.message);
    });
    throw err;
  }
})().catch((err) => {
  // Report startup/navigation failures instead of an unhandled rejection.
  console.error('Fatal error:', err);
  process.exitCode = 1;
});