DevOps技能Top10

数据来源: https://devops-jobs.net/

const fs = require('fs/promises');
const puppeteer = require('puppeteer');

/**
 * Crawl the job list on https://devops-jobs.net/ and dump it to `output.jl`
 * (JSON Lines: one JSON object per job).
 *
 * Flow:
 *   1. Launch a browser and open the site.
 *   2. Click `#load-more-jobs` until the button is no longer in the DOM,
 *      waiting after each click for the number of list items to grow.
 *   3. Extract url/title/logo/company/location/tags/timestamp from every
 *      `div.list-group-item` and write all records to `output.jl`.
 *
 * The browser is always closed, even when a step throws, and a failure is
 * reported on stderr with a non-zero exit code instead of being left as an
 * unhandled rejection.
 */
(async () => {
  console.log('open browser');
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://devops-jobs.net/');
    while (true) {
      const elements = await page.$$('div.list-group-item');
      const count = elements.length;
      const loadMore = await page.$('#load-more-jobs');
      if (loadMore === null) {
        // No "load more" button left: the whole list is rendered.
        console.log(`total jobs: ${count}`);
        console.log('write file: output.jl');
        const lines = [];
        for (const element of elements) {
          // Runs inside the page; `$` is the page's own jQuery-style global
          // (presumably jQuery shipped by the site — it is not defined here).
          const item = await element.evaluate((node) => ({
            url: $('a.list-group-item-action', node).attr('href'),
            title: $('h5', node).first().text(),
            logo: $('img[alt$="logo"]', node).attr('src'),
            company: $('p.job-list-item-company', node).first().text(),
            location: $('span.job-list-item-location', node).first().text(),
            tags: $('span.badge-light', node).get().map((badge) => badge.innerText),
            // Key was misspelled as 'timestmap' in the emitted records.
            timestamp: $('time', node).attr('datetime'),
          }));
          lines.push(`${JSON.stringify(item)}\n`);
        }
        // Write once and overwrite: appending per item duplicated every job
        // when the script was run again, which skews the tag statistics.
        await fs.writeFile('output.jl', lines.join(''));
        break;
      }
      console.log(`load more: ${count} ...`);
      await loadMore.click();
      // Block until the click has actually added items to the list.
      await page.waitForFunction(
        (previousCount) =>
          document.querySelectorAll('div.list-group-item').length > previousCount,
        {},
        count,
      );
    }
  } finally {
    console.log('close browser');
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
$ node crawler.js
open browser
load more: 60 ...
load more: 120 ...
load more: 180 ...
load more: 240 ...
load more: 300 ...
load more: 360 ...
load more: 420 ...
load more: 480 ...
load more: 540 ...
load more: 600 ...
load more: 660 ...
load more: 720 ...
load more: 780 ...
load more: 840 ...
load more: 900 ...
load more: 960 ...
total jobs: 1000
write file: output.jl
close browser

$ cat output.jl | jq -r .tags[] | sort | uniq -c | sort -nr | head
    624 AWS
    614 Python
    475 Linux
    432 Kubernetes
    399 Terraform
    365 CI
    340 CD
    322 Docker
    318 Ansible
    282 Go

$ cat output.jl | jq -s 'map(.tags[])|group_by(.)|sort_by(length)|reverse[:10]|map({(.[0]):length})|add'
{
  "AWS": 624,
  "Python": 614,
  "Linux": 475,
  "Kubernetes": 432,
  "Terraform": 399,
  "CI": 365,
  "CD": 340,
  "Docker": 322,
  "Ansible": 318,
  "Go": 282
}