使用 Node.js 爬取一些测试图片

发布: 2019-03-06 15:12:20 标签: nodeJs
const fs = require('fs')
const axios = require('axios')
const cheerio = require('cheerio')

// Site origin; the crawler joins relative "next page" links onto it.
const baseUrl = 'https://pixabay.com'

// Page counter, starting at 1; the crawler bumps it per page and stops following links at 40.
let page = 1

// First search-results page to crawl.
const url = `${baseUrl}/images/search/nature/?cat=science`
10
/**
 * Download one image into the local `images/` directory.
 *
 * The file name is everything after the last '/' of `src`. The returned
 * promise never rejects: a failure is logged and handed back as the resolved
 * value, so a caller looping over many images can simply keep going.
 *
 * @param {string} src - Absolute URL of the image to fetch.
 * @returns {Promise<Error|undefined>} Resolves with `undefined` once the file
 *   has been completely written, or with the error if anything failed.
 */
async function downloadImage(src) {
  const name = src.slice(src.lastIndexOf('/') + 1)
  console.log(`${src}开始下载`)
  try {
    const response = await axios({
      url: src,
      responseType: 'stream'
    })
    // Make sure the target directory exists; otherwise the write stream errors out.
    fs.mkdirSync('images', { recursive: true })
    // pipe() returns immediately, so wait for the write stream's 'finish'
    // event before reporting success.
    await new Promise((resolve, reject) => {
      const file = fs.createWriteStream(`images/${name}`)
      file.on('finish', resolve)
      file.on('error', reject)
      response.data.on('error', err => {
        // pipe() does not forward source errors; close the file ourselves.
        file.destroy()
        reject(err)
      })
      response.data.pipe(file)
    })
    console.log(`${name}下载成功`)
  } catch (e) {
    console.log('图片下载错误', e)
    return e
  }
}
30
/**
 * Crawl one search-results page: collect the image URLs it lists, download
 * them one at a time, then follow the "next page" link.
 *
 * Relies on the module-level `page` counter (no link is followed once it
 * reaches 40) and on `baseUrl` to resolve the next-page link.
 *
 * @param {string} url - Absolute URL of the results page to crawl.
 * @returns {Promise<void>} Settles after this page and every page reached
 *   from it have been processed; rejects if a page cannot be fetched.
 */
async function spider(url) {
  console.log('开始收集图片', url)

  const { data: html } = await axios.get(url)
  const $ = cheerio.load(html)
  const urls = []
  $('#content .media_list .search_results img').each((index, item) => {
    const src = $(item).attr('src')
    if (src && src.startsWith('https')) {
      urls.push(src)
      return
    }
    // src is missing or not an https URL (presumably a lazy-load placeholder):
    // fall back to the first candidate of data-lazy-srcset, if there is one.
    const lazySrcset = $(item).attr('data-lazy-srcset')
    if (lazySrcset) {
      urls.push(lazySrcset.split(' 1x,')[0])
    }
  })

  // Download sequentially, one image at a time.
  for (const imageUrl of urls) {
    await downloadImage(imageUrl)
  }

  // Next page
  if (page >= 40) {
    return
  }
  const href = $('#content .media_list>a.pure-button').attr('href')
  if (!href) {
    // No next-page link: nothing more to crawl.
    return
  }
  page += 1
  // Await the recursion so errors propagate and the caller's promise covers
  // the whole crawl.
  await spider(new URL(href, baseUrl).href)
}
59
// Start the crawl; report a failure instead of leaving the rejection unhandled.
spider(url).catch(e => {
  console.log('爬取失败', e)
  process.exitCode = 1
})
61
复制代码