热搜爬虫

准备工作

$ npm install puppeteer-core

两种使用方式:通过 puppeteer.launch 启动本地 Chrome(见微博爬虫),或通过 puppeteer.connect 连接远程浏览器(见百度爬虫)。

微博爬虫(weibo.js)

const fs = require('fs/promises');
const puppeteer = require('puppeteer-core');

/**
 * Weibo hot-search scraper.
 *
 * Launches a local headless Chrome, opens weibo.com, clicks the search box so
 * the suggestion dropdown appears, and then writes three artifacts to the
 * current working directory:
 *   - weibo.png  : screenshot of the dropdown element
 *   - weibo.html : outerHTML of the dropdown element
 *   - weibo.json : array of hot-search keywords extracted from the dropdown
 *
 * The browser is always closed, even when a step fails, so no Chrome process
 * is left behind. Any failure is logged and reflected in the exit code.
 */
(async () => {
  const browser = await puppeteer.launch({
    headless: true,
    executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    args: ['--disable-notifications']
  });
  try {
    const [page] = await browser.pages();
    // Start navigation and both waits together so the suggestion response
    // cannot be missed; only the search box handle (first entry) is needed.
    const [textbox] = await Promise.all([
      page.waitForSelector('.gn_search_v2>.W_input'),
      page.waitForResponse(response => response.url().includes('https://s.weibo.com/ajax/jsonp/gettopsug')),
      page.goto('https://weibo.com/'),
    ]);
    await textbox.click();
    // waitForSelector resolves to the element handle, so no second lookup is needed.
    const element = await page.waitForSelector('.gn_topmenulist_search', {visible: true});
    const content = await page.$eval('.gn_topmenulist_search', e => e.outerHTML);
    // Each hot-word <li> carries the keyword after "text=" in its action-data
    // attribute; '#' characters are stripped from the keyword.
    const hotlist = await page.$$eval(
      '.gn_topmenulist_search li[suda-data*="hotword"]',
      e => e.map(x => x.getAttribute('action-data').split('text=')[1].replace(/#/g, ''))
    );
    await element.screenshot({path: 'weibo.png'});
    await fs.writeFile('weibo.html', content);
    await fs.writeFile('weibo.json', JSON.stringify(hotlist, null, 2));
  } finally {
    // Release the Chrome process whether or not scraping succeeded.
    await browser.close();
  }
})().catch((err) => {
  // Do not leave the top-level promise floating: report and signal failure.
  console.error(err);
  process.exitCode = 1;
});

百度爬虫(baidu.js)

const fs = require('fs/promises');
const puppeteer = require('puppeteer-core');

/**
 * Baidu realtime hot-board scraper.
 *
 * Connects to an already-running browser over WebSocket, opens the Baidu
 * realtime board, extracts one record per board entry, prints a tab-separated
 * summary line for each, and writes all records to baidu.json in the current
 * working directory.
 *
 * Each record has: index, image, subject, summary, score, link, timestamp
 * (timestamp is the same scrape-time value in milliseconds for every record).
 *
 * NOTE(review): browser.close() is called on a connected (not launched)
 * browser; if the remote endpoint is shared, browser.disconnect() may be the
 * intended call — confirm against the remote setup.
 */
(async () => {
  const browser = await puppeteer.connect({
    browserWSEndpoint: `ws://172.16.3.21:3000/`,
  });
  try {
    const [page] = await browser.pages();
    // Abort every request whose URL contains cdn.bcebos.com; the sample
    // output's "image" values point at that host.
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      if (request.url().includes('cdn.bcebos.com')) {
        request.abort();
      } else {
        request.continue();
      }
    });
    await page.goto('https://top.baidu.com/board?tab=realtime');
    // Hide the "look more" links in the rendered page; their href is still
    // readable from the DOM below.
    await page.addStyleTag({content: 'a[class^="look-more_"] {display: none;}'});
    const timestamp = new Date().getTime();
    // The callback runs inside the page, so the timestamp must be passed in
    // as an argument rather than captured from this scope.
    const results = await page.$$eval('div[class^="category-wrap_"]', (es, timestamp) => es.map(e => {
      const index = e.querySelector('a[class^="img-wrapper_"] div[class^="index_"]').innerText.trim();
      const image = e.querySelector('a[class^="img-wrapper_"] img').getAttribute('src');
      const subject = e.querySelector('div[class^="content_"] .c-single-text-ellipsis').innerText.trim();
      const summary = e.querySelector('div[class^="content_"] div[class^="hot-desc_"]').innerText.trim();
      const score = e.querySelector('div[class^="hot-index_"]').innerText.trim();
      const link = e.querySelector('a[class^="look-more_"]').getAttribute('href');
      return {
        // Explicit radix 10 so the text is always parsed as decimal.
        index: Number.parseInt(index, 10),
        image,
        subject,
        summary,
        score: Number.parseInt(score, 10),
        link,
        timestamp,
      };
    }), timestamp);
    results.forEach(item => {
      console.log(`${item.index}\t${item.subject}\t(${item.score})`);
    });
    await fs.writeFile('baidu.json', JSON.stringify(results, null, 2));
  } finally {
    // End the browser session whether or not scraping succeeded.
    await browser.close();
  }
})().catch((err) => {
  // Do not leave the top-level promise floating: report and signal failure.
  console.error(err);
  process.exitCode = 1;
});

运行结果

$ node weibo.js

$ ls
weibo.html
weibo.js
weibo.json
weibo.png

$ cat weibo.json
[
  "勿忘九一八",
  "九一八90周年",
  "郑州女子按摩1小时花费19.8万",
  "勿忘国耻",
  "她们仍在等道歉",
  "防空警报",
  "为什么要强调十四年抗战",
  "成龙电影情节真实再现",
  "电影版寻秦记定档",
  "杨倩开枪前眼神杀"
]

$ imgcat weibo.png
$ node baidu.js
1 31省区市新增本土确诊43例 在福建 (4911816)
2 官方:定期开展对明星艺人税收检查 (4876207)
3 猪肉价格为何一降再降仍未探底    (4733629)
4 福建本轮累计报告本土确诊335例   (4684737)
5 法外长:撕毁协议是美澳对法的藐视 (4569199)

$ ls
baidu.js
baidu.json

$ cat baidu.json
[
  {
    "index": 1,
    "image": "https://fyb-1.cdn.bcebos.com/fyb-1//d0f97ee9b6cfd8a48a32f6735a7c585a?x-bce-process=image/resize,m_fill,w_256,h_170",
    "subject": "31省区市新增本土确诊43例 在福建",
    "summary": "9月18日0—24时,31个省(自治区、直辖市)和新疆生产建设兵团报告新增本土确诊病例43例(均在福建,其中厦门市39例...",
    "score": 4911816,
    "link": "https://www.baidu.com/s?wd=31%E7%9C%81%E5%8C%BA%E5%B8%82%E6%96%B0%E5%A2%9E%E6%9C%AC%E5%9C%9F%E7%A1%AE%E8%AF%8A43%E4%BE%8B+%E5%9C%A8%E7%A6%8F%E5%BB%BA&rsv_dl=fyb_news",
    "timestamp": 1632026082045
  },
  {
    "index": 2,
    "image": "https://fyb-1.cdn.bcebos.com/fyb-1//70147692153c737874e6b467753e503e?x-bce-process=image/resize,m_fill,w_256,h_170",
    "subject": "官方:定期开展对明星艺人税收检查",
    "summary": "近日,国家税务总局办公厅发布通知,加强文娱领域从业人员税收管理,定期对明星艺人、网络主播“双随机、一公开”税收检查。",
    "score": 4876207,
    "link": "https://www.baidu.com/s?wd=%E5%AE%98%E6%96%B9%3A%E5%AE%9A%E6%9C%9F%E5%BC%80%E5%B1%95%E5%AF%B9%E6%98%8E%E6%98%9F%E8%89%BA%E4%BA%BA%E7%A8%8E%E6%94%B6%E6%A3%80%E6%9F%A5&rsv_dl=fyb_news",
    "timestamp": 1632026082045
  },
  ...
]