go.jpをクロールするスクリプト
2024/2/24 23:19:00
import fs from "fs";
import * as R from "ramda";
import { Window } from "happy-dom";
import { urls } from "./urls";
// Shared happy-dom window and DOM parser used to extract links from HTML text.
const window = new Window();
const dom_parser = new window.DOMParser();

// Crawl queue keyed by URL. `got` starts as false for every seed URL and is
// flipped to true once the URL has been handled.
const queue = new Map<string, { got: boolean }>(
  urls.map((url) => [url, { got: false }])
);

// Make sure the output directory exists: on a fresh checkout readdirSync would
// otherwise throw ENOENT before the crawl even starts.
fs.mkdirSync("./files", { recursive: true });

// Names of the files already present in ./files.
const file_names = fs.readdirSync("./files");
// Keep only the part of each file name before the first ".", which is the
// encoded URL the file was saved under.
const file_names_base64 = file_names.map((file_name) => {
  const [url_base64 = ""] = file_name.split(".");
  return url_base64;
});
// Set of encoded URLs that already have a file on disk (used as a cache key).
const got_url_base64_set = new Set<string>(file_names_base64);
console.info({ got_url_base64_set });
// Main crawl loop. Each iteration picks one pending URL from `queue`
// (URLs containing "pdf" first), waits 2 seconds, fetches it, and stores the
// response under ./files. HTML responses are also scanned for further go.jp
// links, which are appended to the queue. The loop ends when nothing is pending.
// Any error in an iteration is logged and the loop moves on to the next URL.
for (;;) {
  try {
    // Pending URLs, grouped into those containing "pdf" and everything else.
    const urls_for_scraping_groupby = R.groupBy((url: string) =>
      url.includes("pdf") ? "pdf" : "else"
    )([...queue].filter(([, { got }]) => !got).map(([url]) => url));
    const urls_for_scraping = [
      ...(urls_for_scraping_groupby.pdf ?? []),
      ...(urls_for_scraping_groupby.else ?? []),
    ];
    if (urls_for_scraping.length === 0) {
      break;
    }
    const url = urls_for_scraping[0];
    console.info({ url });

    // The base64-encoded URL is both the cache key and the file name.
    // Standard base64 may contain "/", which would be treated as a path
    // separator, so it is replaced by "_" (not in the base64 alphabet, so the
    // mapping stays collision-free and names without "/" are unchanged).
    const url_base64 = Buffer.from(url).toString("base64").replace(/\//g, "_");

    // Mark the URL as handled before doing any I/O so that a failing fetch
    // cannot make the loop pick the same URL forever.
    queue.set(url, { got: true });
    if (got_url_base64_set.has(url_base64)) {
      // A file for this URL already exists on disk; nothing to download.
      continue;
    }

    // Throttle requests.
    await sleep(2000);
    const fetch_result = await fetch(url);
    const file_type = fetch_result.headers.get("content-type");
    if (file_type === null) {
      console.warn("file type is null");
      continue;
    }

    // File extension = media subtype. Parameters such as "; charset=UTF-8"
    // are dropped and the value is lower-cased first, otherwise
    // "text/html; charset=UTF-8" would never be recognised as "html".
    const [mime_type = ""] = file_type.split(";");
    const file_ext = mime_type.trim().toLowerCase().split("/")[1];

    if (file_ext === "html") {
      const file_name = `./files/${url_base64}.${file_ext}`;
      const file_text = await fetch_result.text();
      const urls_next = await links_get_from_html_text(file_text);
      // Add newly discovered go.jp links to the queue.
      for (const url_next of urls_next) {
        if (!queue.has(url_next) && url_next.includes("go.jp")) {
          queue.set(url_next, { got: false });
        }
      }
      if (!file_text) {
        console.warn("file is null");
        continue;
      }
      // Promise-based write: the await really waits for the write, and a
      // failure reaches the catch below instead of being silently dropped.
      await fs.promises.writeFile(file_name, file_text);
    } else if (file_ext === "pdf") {
      // NOTE(review): PDFs are stored under a random UUID rather than the
      // encoded URL, so they are never found in got_url_base64_set and will be
      // downloaded again on the next run. Kept as-is; confirm this is intended.
      const file_name = `./files/${crypto.randomUUID()}.${file_ext}`;
      const array_buffer = await fetch_result.arrayBuffer();
      await fs.promises.writeFile(file_name, Buffer.from(array_buffer));
    }
  } catch (error) {
    console.error("catch", error);
  }
}
/**
 * Wait for the given amount of time.
 *
 * @param ms - Delay in milliseconds, passed straight to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
async function sleep(ms: number) {
  const timer_done = new Promise((wake_up) => {
    setTimeout(wake_up, ms);
  });
  return timer_done;
}
/**
 * Collect the link targets mentioning "go.jp" from an HTML document.
 *
 * The text is parsed with the module-level `dom_parser`, every `<a>` element
 * is looked up, and the `href` values that contain the substring "go.jp" are
 * returned. Duplicates are not removed.
 *
 * @param text - HTML source to scan.
 * @returns The matching `href` values, in the order `querySelectorAll` yields them.
 */
async function links_get_from_html_text(text: string) {
  const dom = dom_parser.parseFromString(text, "text/html");
  // Copy the query result into a plain array first: whether a NodeList offers
  // Array methods such as `map` depends on the DOM implementation, while
  // Array.from works for any iterable or array-like result.
  const a_elems = Array.from(dom.querySelectorAll(`a`));
  const urls = a_elems
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- element type comes from the DOM library
    .map((a: any) => a.href)
    // Anchors without an href are skipped by the optional chaining.
    .filter((href) => href?.includes("go.jp"));
  return urls;
}