// sniff_puppeteer_save.js
//
// Install: npm install puppeteer
// Run:     node sniff_puppeteer_save.js

const fs = require('fs').promises;
const path = require('path');
const puppeteer = require('puppeteer');

//// CONFIG ////

// Address of the page of interest.
// NOTE(review): confirm the main routine actually navigates here; a URL
// hard-coded in the navigation call would leave this setting unused.
const targetUrl = 'https://sushiscan.net/la-gardienne-des-concubines-volume-1/';

// Only responses whose URL starts with this prefix are saved.
// Set to '' to disable prefix filtering.
const targetPrefix = 'https://c.sushiscan.net/wp-content/';

// Extensions to allow (set to [] to disable extension filtering).
// Include the leading dot, lowercase.
const extensions = ['.webp']; // e.g. ['.webp', '.jpg'] or [] for no extension filter

// Where to save files
const DATA_DIR = path.join(__dirname, 'data');

//// Helpers ////

/**
 * Tells whether the path component of `url` ends with one of the given
 * extensions. The comparison is case-insensitive and ignores the query string
 * and fragment, because only `URL#pathname` is inspected.
 *
 * @param {string} url - Absolute URL to test.
 * @param {string[]} exts - Extensions including the leading dot (e.g. '.webp').
 *   A missing or empty list disables the filter.
 * @returns {boolean} `true` when the filter is disabled or an extension
 *   matches; `false` when nothing matches or `url` cannot be parsed.
 */
function hasExtension(url, exts) {
  // No list (or an empty one) means "accept everything".
  if (!exts || exts.length === 0) return true;

  try {
    const pathname = new URL(url).pathname.toLowerCase();
    return exts.some((ext) => pathname.endsWith(ext.toLowerCase()));
  } catch (e) {
    // Unparseable URL (or malformed extension list): treat as "no match".
    return false;
  }
}

/**
 * Returns the last non-empty segment of a URL's path.
 *
 * Absolute URLs are parsed with `URL`; anything that fails to parse falls back
 * to plain string handling with the query string and fragment stripped.
 *
 * @param {string} url - URL (or URL-like string) to inspect.
 * @returns {string} The final path segment, or 'index' when the path has no
 *   segments (e.g. 'https://host/' or an empty string).
 */
function lastPathSegment(url) {
  let pathPart;
  try {
    pathPart = new URL(url).pathname;
  } catch {
    // Fallback for non-absolute input. Coerce first so null/undefined or a
    // non-string value cannot throw from `.split`.
    pathPart = String(url ?? '').split('?')[0].split('#')[0];
  }

  // Dropping empty segments makes a trailing slash resolve to the segment
  // before it.
  const segments = pathPart.split('/').filter(Boolean);
  return segments.length > 0 ? segments[segments.length - 1] : 'index';
}

/**
 * Turns an arbitrary string into a conservative file name.
 *
 * Every character other than ASCII letters, digits, dot, dash and underscore
 * is replaced with an underscore. A leading dot gets a 'file' prefix so the
 * result is not a hidden file, and the result is capped at 200 characters.
 *
 * @param {string} name - Raw name, typically the last URL path segment.
 * @returns {string} A non-empty name of at most 200 characters; 'index' when
 *   the input is empty.
 */
function sanitizeFilename(name) {
  const maxLen = 200;

  let safe = String(name ?? '').replace(/[^a-zA-Z0-9.\-_]/g, '_');
  if (!safe) return 'index';

  // Avoid filenames starting with a dot (hidden files).
  if (safe.startsWith('.')) safe = `file${safe}`;

  // Truncate last so the prefix above can never push the result past maxLen.
  return safe.length > maxLen ? safe.slice(0, maxLen) : safe;
}

// Paths already handed out by uniqueFilename during this run. Response
// handlers run concurrently, so a name can be "free" on disk yet already
// promised to another handler that has not written its file.
// Keys are lowercased so names differing only by case are treated as the same
// file, which is the safe choice on case-insensitive file systems.
const reservedPaths = new Set();

/**
 * Picks a file name that is neither present in `dir` nor already handed out by
 * an earlier call in this process, and reserves it.
 *
 * Tries `base` first, then `name-1.ext`, `name-2.ext`, ... keeping the
 * extension of `base`.
 *
 * @param {string} dir - Directory the file will be written to.
 * @param {string} base - Desired file name (already sanitized).
 * @returns {Promise<string>} The chosen file name (not the full path).
 * @throws {Error} Any `fs.access` failure other than ENOENT (for example a
 *   permission error), instead of mistaking it for "name is free".
 */
async function uniqueFilename(dir, base) {
  const ext = path.extname(base);
  const stem = path.basename(base, ext);

  for (let i = 0; ; i += 1) {
    const candidate = i === 0 ? base : `${stem}-${i}${ext}`;
    const full = path.join(dir, candidate);
    const key = full.toLowerCase();

    if (reservedPaths.has(key)) continue;

    let exists = true;
    try {
      await fs.access(full);
    } catch (err) {
      if (err.code !== 'ENOENT') throw err;
      exists = false;
    }
    if (exists) continue;

    // Another call may have claimed this name while we awaited fs.access.
    // The check and the reservation below run with no await in between.
    if (reservedPaths.has(key)) continue;

    reservedPaths.add(key);
    return candidate;
  }
}

/**
 * Creates the output directory (DATA_DIR), including missing parents.
 * With `recursive: true` an already existing directory is not an error.
 *
 * @returns {Promise<void>}
 */
async function ensureDataDir() {
  await fs.mkdir(DATA_DIR, { recursive: true });
}

//// Main ////

// Opens a visible Chrome window, saves every response that passes the prefix
// and extension filters into DATA_DIR (plus a `.meta.json` sidecar per file),
// then navigates to targetUrl and keeps listening until the process is stopped.
(async () => {
  await ensureDataDir();

  const browser = await puppeteer.launch({
    headless: false, // headless:true won't have a visible window
    executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe', // path to your Chrome
    defaultViewport: null, // disables the default small viewport
    args: [
      '--start-maximized',
      '--disable-blink-features=AutomationControlled', // hides navigator.webdriver
    ],
    ignoreDefaultArgs: ['--enable-automation'], // remove automation flag
  });

  const page = await browser.newPage();

  // Minimal content-type -> extension mapping, used only when the URL's last
  // path segment has no extension of its own. Extend as needed.
  const extensionByContentType = [
    ['image/webp', '.webp'],
    ['image/png', '.png'],
    ['image/jpeg', '.jpg'],
    ['text/html', '.html'],
    ['application/json', '.json'],
  ];

  page.on('response', async (response) => {
    const url = response.url();
    try {
      if (targetPrefix && !url.startsWith(targetPrefix)) return;
      if (!hasExtension(url, extensions)) return;

      // Read the body first (works for binary and text) so that a response
      // without a readable body never claims a file name.
      let buffer;
      try {
        buffer = await response.buffer();
      } catch (err) {
        console.error('Could not read response body for', url, err.message);
        return;
      }

      let filename = sanitizeFilename(lastPathSegment(url));

      // Preserve a sensible extension when the path segment lacks one but the
      // content-type indicates one.
      const headers = response.headers();
      const contentType = headers['content-type'] || '';
      if (!path.extname(filename)) {
        const match = extensionByContentType.find(([type]) => contentType.includes(type));
        if (match) filename += match[1];
      }

      filename = await uniqueFilename(DATA_DIR, filename);
      const outPath = path.join(DATA_DIR, filename);

      await fs.writeFile(outPath, buffer);

      // Sidecar metadata describing where the file came from.
      const meta = {
        url,
        status: response.status(),
        headers,
        savedAt: new Date().toISOString(),
        filename,
      };
      await fs.writeFile(`${outPath}.meta.json`, JSON.stringify(meta, null, 2));

      console.log(`Saved: ${outPath} (bytes: ${buffer.length})`);
    } catch (err) {
      console.error('Error handling response', url, err);
    }
  });

  // Navigate to the configured page so it produces the requests to capture.
  // A navigation failure or timeout is logged rather than fatal: the window
  // stays open and the response listener keeps working for manual browsing.
  try {
    await page.goto(targetUrl, { waitUntil: 'networkidle2', timeout: 300_000 });
  } catch (err) {
    console.error(`Navigation to ${targetUrl} failed:`, err.message);
  }

  console.log('Listening for responses. Press Ctrl+C to stop.');
})().catch((err) => {
  // Setup failures (output directory, browser launch) end up here instead of
  // surfacing as an unhandled promise rejection.
  console.error('Fatal error:', err);
  process.exitCode = 1;
});