Big brain code
This commit is contained in:
161
index.js
Normal file
161
index.js
Normal file
@@ -0,0 +1,161 @@
|
||||
// sniff_puppeteer_save.js
|
||||
// npm install puppeteer
|
||||
// node sniff_puppeteer_save.js
|
||||
|
||||
const fs = require('fs').promises;
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer');
|
||||
|
||||
//// CONFIG ////
// URL of the page whose resource requests are to be captured.
const targetUrl = 'https://sushiscan.net/la-gardienne-des-concubines-volume-1/';

// Only responses whose URL starts with this prefix are saved.
// Set to '' to disable prefix filtering
const targetPrefix = 'https://c.sushiscan.net/wp-content/';

// Extensions to allow (set to [] to disable extension filtering)
// include the leading dot, lowercase
const extensions = ['.webp']; // e.g. ['.webp', '.jpg'] or [] for no extension filter

// Where to save files
const DATA_DIR = path.join(__dirname, 'data');
|
||||
|
||||
//// Helpers ////
|
||||
/**
 * Report whether the path component of a URL ends with one of the given
 * extensions. The comparison is case-insensitive and ignores the query
 * string and fragment, because only `URL#pathname` is inspected.
 *
 * @param {string} url - Absolute URL to test.
 * @param {string[]} exts - Extensions including the leading dot; a missing
 *   or empty list disables filtering.
 * @returns {boolean} `true` when filtering is disabled or the path matches;
 *   `false` when nothing matches or the URL cannot be parsed.
 */
function hasExtension(url, exts) {
  if (!exts || exts.length === 0) return true;

  try {
    const pathname = new URL(url).pathname.toLowerCase();
    return exts.some((ext) => pathname.endsWith(ext.toLowerCase()));
  } catch {
    // Unparseable URL (or malformed extension entry): treat as "no match".
    return false;
  }
}
|
||||
|
||||
/**
 * Return the last non-empty segment of a URL's path.
 *
 * Empty segments are dropped, so a trailing slash yields the segment before
 * it. When the input is not a parseable absolute URL, the query string and
 * fragment are stripped by hand and the remainder is split on '/'.
 *
 * @param {string} url - URL (or URL-like string) to inspect.
 * @returns {string} The last path segment, or 'index' when there is none.
 */
function lastPathSegment(url) {
  let pathPart;
  try {
    pathPart = new URL(url).pathname;
  } catch {
    // Fallback for relative or malformed input. Coerce to a string first so
    // null/undefined produce 'index' instead of throwing from inside the
    // error handler.
    pathPart = String(url ?? '').split('?')[0].split('#')[0];
  }

  const segments = pathPart.split('/').filter(Boolean);
  return segments.length > 0 ? segments[segments.length - 1] : 'index';
}
|
||||
|
||||
/**
 * Turn an arbitrary string into a safe file name.
 *
 * Every character other than ASCII letters, digits, dot, dash and underscore
 * is replaced with an underscore. Names starting with a dot get a 'file'
 * prefix so they are not hidden files. The result is capped at 200
 * characters; when truncation is needed the extension is kept and the stem
 * is shortened instead.
 *
 * @param {string} name - Raw name, typically the last URL path segment.
 * @returns {string} A non-empty file name of at most 200 characters.
 */
function sanitizeFilename(name) {
  const maxLen = 200;

  let safe = String(name ?? '').replace(/[^a-zA-Z0-9.\-_]/g, '_');

  // Prefix before enforcing the length cap so the cap applies to the final name.
  if (safe.startsWith('.')) safe = `file${safe}`;

  if (safe.length > maxLen) {
    const ext = path.extname(safe);
    // Keep the extension when it fits; a pathological "extension" longer
    // than the cap is simply cut like the rest of the name.
    safe =
      ext.length > 0 && ext.length < maxLen
        ? safe.slice(0, maxLen - ext.length) + ext
        : safe.slice(0, maxLen);
  }

  return safe || 'index';
}
|
||||
|
||||
// Full paths already handed out by uniqueFilename() during this run. Callers
// write their file some time after the name is chosen, so a disk check alone
// would let concurrent callers all pick the same "free" name.
const reservedOutputPaths = new Set();

/**
 * Pick a file name inside `dir` that neither exists on disk nor has been
 * handed out earlier in this process.
 *
 * The first candidate is `base` itself; after that `-1`, `-2`, ... is
 * inserted before the extension until a free name is found.
 *
 * @param {string} dir - Directory the file will be written to.
 * @param {string} base - Desired file name (already sanitized).
 * @returns {Promise<string>} The chosen file name (not the full path).
 * @throws Any `fs.access` error other than ENOENT, so that e.g. a permission
 *   problem is not mistaken for "name is free".
 */
async function uniqueFilename(dir, base) {
  const ext = path.extname(base);
  const stem = path.basename(base, ext);

  for (let i = 0; ; i += 1) {
    const candidate = i === 0 ? base : `${stem}-${i}${ext}`;
    const full = path.join(dir, candidate);

    if (reservedOutputPaths.has(full)) continue;

    let exists = true;
    try {
      await fs.access(full);
    } catch (err) {
      if (err.code !== 'ENOENT') throw err;
      exists = false;
    }

    // Re-check the reservation: another caller may have claimed this path
    // while we were awaiting the disk check. The check-and-add below runs
    // without an intervening await, so it cannot be interleaved.
    if (!exists && !reservedOutputPaths.has(full)) {
      reservedOutputPaths.add(full);
      return candidate;
    }
  }
}
|
||||
|
||||
/**
 * Create the output directory, including any missing parent directories.
 * Does nothing if the directory already exists.
 *
 * @param {string} [dir=DATA_DIR] - Directory to create; defaults to the
 *   script's configured data directory.
 * @returns {Promise<void>}
 */
async function ensureDataDir(dir = DATA_DIR) {
  await fs.mkdir(dir, { recursive: true });
}
|
||||
|
||||
//// Main ////
|
||||
// Entry point: launch a visible Chrome, save every response that passes the
// prefix/extension filters into DATA_DIR (plus a `.meta.json` sidecar), and
// navigate to targetUrl so the page starts issuing requests.
(async () => {
  await ensureDataDir();

  const browser = await puppeteer.launch({
    headless: false, // headless:true won't have a visible window
    executablePath: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe', // path to your Chrome
    defaultViewport: null, // disables the default small viewport
    args: [
      '--start-maximized',
      '--disable-blink-features=AutomationControlled', // hides navigator.webdriver
    ],
    ignoreDefaultArgs: ['--enable-automation'], // remove automation flag
  });

  // Extension appended when the URL's last path segment has none.
  // Minimal mapping for common types; extend as needed.
  const extensionByContentType = [
    ['image/webp', '.webp'],
    ['image/png', '.png'],
    ['image/jpeg', '.jpg'],
    ['text/html', '.html'],
    ['application/json', '.json'],
  ];

  // Upper bound on name-collision retries for a single response.
  const maxWriteAttempts = 50;

  /**
   * Save one response body and its metadata if the URL passes the filters.
   * Responses that do not match, or whose body cannot be read, are skipped.
   */
  const saveResponse = async (response) => {
    const url = response.url();
    if (targetPrefix && !url.startsWith(targetPrefix)) return;
    if (!hasExtension(url, extensions)) return;

    let baseName = sanitizeFilename(lastPathSegment(url));
    const headers = response.headers();
    const contentType = headers['content-type'] || '';
    if (!path.extname(baseName) && contentType) {
      const match = extensionByContentType.find(([type]) => contentType.includes(type));
      if (match) baseName += match[1];
    }

    // Read the body before choosing a name: the read can fail (e.g. for
    // redirects), and it keeps the gap between picking a name and creating
    // the file as short as possible.
    let buffer;
    try {
      buffer = await response.buffer();
    } catch (err) {
      console.error('Could not read response body for', url, err.message);
      return;
    }

    // Handlers run concurrently, so two of them can pick the same free name.
    // 'wx' makes creation exclusive; on EEXIST we just ask for another name.
    let filename;
    let outPath;
    let written = false;
    for (let attempt = 0; attempt < maxWriteAttempts && !written; attempt += 1) {
      filename = await uniqueFilename(DATA_DIR, baseName);
      outPath = path.join(DATA_DIR, filename);
      try {
        await fs.writeFile(outPath, buffer, { flag: 'wx' });
        written = true;
      } catch (err) {
        if (err.code !== 'EEXIST') throw err;
      }
    }
    if (!written) {
      throw new Error(`Could not find a free file name for ${baseName}`);
    }

    const meta = {
      url,
      status: response.status(),
      headers,
      savedAt: new Date().toISOString(),
      filename,
    };
    await fs.writeFile(outPath + '.meta.json', JSON.stringify(meta, null, 2));

    console.log(`Saved: ${outPath} (bytes: ${buffer.length})`);
  };

  try {
    const page = await browser.newPage();

    // The listener itself must never reject: an unhandled rejection here
    // would take the whole process down for one bad response.
    page.on('response', (response) => {
      saveResponse(response).catch((err) => {
        console.error('Error handling response', response.url(), err);
      });
    });

    // Navigate to the configured page so it produces the requests to capture.
    await page.goto(targetUrl, { waitUntil: 'networkidle2', timeout: 300_000 });

    console.log('Listening for responses. Press Ctrl+C to stop.');
  } catch (err) {
    // Best-effort cleanup so a failed start does not leave Chrome running;
    // the original error is the one worth reporting.
    await browser.close().catch(() => {});
    throw err;
  }
})().catch((err) => {
  console.error('Fatal error:', err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user