// File-viewer metadata: 174 lines, 4.8 KiB, JavaScript
import { chromium } from 'playwright';
|
|
import path from 'path';
|
|
import fs from 'fs';
|
|
|
|
const URL_BROWSER = 'ws://localhost:9222/devtools/browser/716e665f-22f3-4c81-b68d-71f7ab9d72f6'
|
|
|
|
const USERNAME = 'protest'
|
|
const MDP = 'Avalanche1'
|
|
|
|
const URL_FORUMS = "https://www.avalancheassociation.ca/forums/"
|
|
const PATH_ENTREE = 'Topics.aspx?forum=173521&group=135525'
|
|
|
|
const SEPARATEUR_POST = "<br> <hr> <br>"
|
|
|
|
|
|
/**
 * Picks a random delay, in milliseconds, between two bounds (both inclusive for integer bounds).
 *
 * @param {number} [min=1000] - One bound of the range.
 * @param {number} [max=3000] - The other bound of the range.
 * @returns {number} A random value between the smaller and the larger bound.
 */
function getRandomDelay(min = 1000, max = 3000) {
  // Normalise the bounds so swapped arguments still cover the whole range uniformly.
  const low = Math.min(min, max);
  const high = Math.max(min, max);
  return Math.floor(Math.random() * (high - low + 1)) + low;
}
|
|
|
|
/**
 * Logs into the site through the login form on the home page.
 *
 * Random pauses are inserted between the steps. Resolves once the navigation triggered by the
 * submit button has finished and the network has gone idle.
 *
 * @param {import('playwright').Page} page - Page used to perform the login.
 * @returns {Promise<void>}
 */
async function login(page) {
  await page.goto('https://www.avalancheassociation.ca/');

  await page.waitForTimeout(getRandomDelay());
  await page.getByPlaceholder('Username').fill(USERNAME);
  await page.waitForTimeout(getRandomDelay());
  await page.getByPlaceholder('Password').fill(MDP);

  await page.waitForTimeout(getRandomDelay());
  await Promise.all([
    // Playwright's value is 'networkidle'; 'networkidle0' is Puppeteer-only and is rejected.
    // The wait is registered before the click so the navigation cannot be missed.
    page.waitForNavigation({ waitUntil: 'networkidle' }),
    page.click('input[name="btn_submitLogin"]'),
  ]);
}
|
|
|
|
/**
 * Turns a thread title into a string that is safe to use as a folder and file name.
 *
 * @param {string|null|undefined} rawTitle - Title as read from the page.
 * @returns {string} Title with whitespace collapsed and path-illegal characters replaced,
 *   or 'untitled' when nothing usable remains.
 */
function sanitizeThreadTitle(rawTitle) {
  const cleaned = (rawTitle ?? '')
    .replace(/\s+/g, ' ')
    .replace(/[<>:"/\\|?*\u0000-\u001f]/g, '_')
    .trim();
  return cleaned === '' ? 'untitled' : cleaned;
}

/**
 * Saves a whole thread: downloads its attachments and renders all of its posts to one PDF.
 *
 * Walks every page of the thread with goToNextPage(), strips the button bar and the poster
 * post-count from each post, concatenates the posts' HTML separated by SEPARATEUR_POST, and
 * prints the result to `data/forums/<title>/<title>.pdf`.
 *
 * @param {import('playwright').Page} page - Page already showing the first page of the thread.
 * @returns {Promise<void>}
 */
async function genererPDF(page) {
  // The title becomes a folder and a file name, so it must not be undefined or contain separators.
  const titreThread = sanitizeThreadTitle(await getPageTitle(page));
  const urlData = 'data/forums/' + titreThread;
  console.log(titreThread);

  let cleanHTML = "";
  do {
    // Run on every page of the thread so attachments on later pages are not skipped;
    // the first call also creates the output folder used for the PDF below.
    await downloadAttachedDocuments(page, urlData);

    const postsLocator = page.locator('.FormTable1');
    const count = await postsLocator.count();

    for (let i = 0; i < count; i++) {
      const post = postsLocator.nth(i);

      // Remove the parts of the post that should not appear in the PDF.
      await post.evaluate(p => {
        p.querySelectorAll('td[id*="tdButtonBar"], div.forumPosterPostCount')
          .forEach(s => s.remove());
      });

      const html = await post.innerHTML();
      // Drop the empty segments between consecutive <br> tags to collapse blank lines.
      cleanHTML += html.split('<br>')
        .map(line => line.trim())
        .filter(line => line !== '')
        .join('<br>');
      cleanHTML += SEPARATEUR_POST;
    }
  } while (await goToNextPage(page));

  const pdfPage = await page.context().newPage();
  try {
    await pdfPage.setContent(cleanHTML);
    await pdfPage.pdf({ path: urlData + '/' + titreThread + '.pdf', format: 'A4' });
  } finally {
    // Always release the temporary tab, even when rendering fails.
    await pdfPage.close();
  }
}
|
|
|
|
/**
 * Moves to the next page of the current listing, if the pager offers one.
 *
 * Waits a random delay first, then looks for the pager's "next page" element; when it is
 * present, clicks it and waits for the resulting navigation to reach the `load` state.
 *
 * @param {import('playwright').Page} page - Page showing a paginated listing.
 * @returns {Promise<boolean>} true when a next page was opened, false when there is none.
 */
async function goToNextPage(page) {
  await page.waitForTimeout(getRandomDelay());

  const boutonSuivant = await page.$('#ctl00_PageContent_fnb_PostPager_pnlNextPage');
  if (boutonSuivant === null) {
    return false;
  }

  // Start listening for the navigation before triggering it with the click.
  const navigation = page.waitForNavigation({ waitUntil: 'load' });
  await Promise.all([navigation, boutonSuivant.click()]);
  return true;
}
|
|
|
|
/**
 * Reads the title shown in the page's summary-title element.
 *
 * @param {import('playwright').Page} page - Page to read the title from.
 * @returns {Promise<string|undefined>} The title without surrounding whitespace, or undefined
 *   when the element is missing or has no text.
 */
async function getPageTitle(page) {
  const titre = await page.$('#ctl00_PageContent_PageSummaryTitle');
  if (titre === null) {
    return undefined;
  }

  // Trim the text so stray whitespace or newlines do not leak into names built from the title.
  const texte = await titre.textContent();
  return texte?.trim();
}
|
|
|
|
|
|
/**
 * Downloads every file linked from the attachment panels of the current page.
 *
 * For each link with an href inside an element whose id contains "pnlAttachments", the link is
 * Alt-clicked and the resulting download is saved into `urlDownloadFolder`.
 *
 * @param {import('playwright').Page} page - Page whose attachments are downloaded.
 * @param {string} urlDownloadFolder - Destination folder; created (with parents) when missing.
 * @returns {Promise<void>}
 */
async function downloadAttachedDocuments(page, urlDownloadFolder) {
  const attachmentDivs = await page.locator('div[id*="pnlAttachments"]').all();
  console.log('DL path: ', urlDownloadFolder);

  // recursive: also creates missing parent folders and does nothing when the folder exists.
  fs.mkdirSync(urlDownloadFolder, { recursive: true });

  for (const div of attachmentDivs) {
    const links = await div.locator('a').all();

    for (const link of links) {
      const hrefAttachement = await link.getAttribute('href');
      if (!hrefAttachement) {
        continue;
      }
      const texteLien = (await link.textContent())?.trim();

      // Await the event and the click together so a failed click cannot leave the
      // download wait dangling as an unhandled rejection.
      const [download] = await Promise.all([
        page.waitForEvent('download'),
        link.click({ modifiers: ['Alt'] }),
      ]);

      // The link text comes from the page: keep only its last path segment so it cannot
      // escape the target folder, and fall back to the name suggested by the download.
      const fileName = path.basename(texteLien ?? '')
        || path.basename(download.suggestedFilename());
      const downloadPath = path.join(urlDownloadFolder, fileName);
      await download.saveAs(downloadPath);
      console.log(`Downloaded PDF: ${downloadPath}`);
    }
  }
}
|
|
|
|
|
|
// Entry point: attach to the running browser over CDP, log in, then walk every page of the
// topic list and export each thread (attachments + PDF) in its own temporary tab.
const browser = await chromium.connectOverCDP(URL_BROWSER);
try {
  const contexts = browser.contexts();

  if (contexts.length === 0) {
    console.log('No contexts available');
  } else {
    const pages = contexts[0].pages();

    if (pages.length === 0) {
      console.log('No pages found in the context');
    } else {
      const page = pages[0];

      await login(page);
      // Start on the first page of the topic list; the former debug call that skipped it
      // (marked "TODO remove") has been dropped so no threads are missed.
      await page.goto(URL_FORUMS + PATH_ENTREE);

      // For each page of the topic list
      do {
        const liensThreads = page.locator('a[href*="Posts.aspx?topic="]');
        const count = await liensThreads.count();

        // For each thread. Links come in pairs and the odd-indexed one targets the thread's
        // last page, so only even indices are visited.
        for (let i = 0; i < count; i += 2) {
          const href = await liensThreads.nth(i).getAttribute('href');
          if (!href) {
            continue;
          }

          const newTab = await page.context().newPage();
          try {
            await newTab.waitForTimeout(getRandomDelay());
            // Resolve against the forum base so relative and absolute hrefs both work.
            await newTab.goto(new URL(href, URL_FORUMS).toString());
            await genererPDF(newTab);
            await newTab.waitForTimeout(getRandomDelay());
          } finally {
            // Never leave a thread tab open, even when the export fails.
            await newTab.close();
          }
        }
      } while (await goToNextPage(page));
    }
  }
} finally {
  // Release the CDP connection so the script can exit once the crawl is over.
  await browser.close();
}
|
|
|
|
|