使用nodeJS爬取一些测试图片
发布: 2019-03-06 15:12:20 | 标签: nodeJs
const fs = require('fs')
const path = require('path')
const axios = require('axios')
const cheerio = require('cheerio')

// Origin used to resolve the (possibly relative) "next page" link.
const baseUrl = 'https://pixabay.com'

// Directory that downloaded images are written into.
const IMAGE_DIR = 'images'

// Crawling stops once this many result pages have been visited.
const MAX_PAGES = 40

// Number of the result page currently being crawled (module-level state
// shared across the recursive spider() calls).
let page = 1

// First search-result page to crawl.
const url = 'https://pixabay.com/images/search/nature/?cat=science'

/**
 * Download a single image into IMAGE_DIR, named after the last path segment
 * of its URL.
 *
 * The returned promise never rejects: on failure the error is logged and
 * becomes the resolution value, so one broken image does not abort the crawl.
 *
 * @param {string} src - Absolute URL of the image.
 * @returns {Promise<undefined|Error>} Resolves with undefined once the file
 *   has been completely written, or with the error that occurred.
 */
async function downloadImage(src) {
  const name = src.slice(src.lastIndexOf('/') + 1)
  console.log(`${src}开始下载`)
  try {
    // The target directory must exist, otherwise the write stream errors out.
    if (!fs.existsSync(IMAGE_DIR)) {
      fs.mkdirSync(IMAGE_DIR)
    }
    const response = await axios({
      url: src,
      responseType: 'stream'
    })
    // pipe() returns immediately; wait for 'finish' so that success is only
    // reported after all data has been flushed to disk.
    await new Promise((resolve, reject) => {
      const file = fs.createWriteStream(path.join(IMAGE_DIR, name))
      response.data.on('error', err => {
        file.destroy()
        reject(err)
      })
      file.on('error', reject)
      file.on('finish', resolve)
      response.data.pipe(file)
    })
    console.log(`${name}下载成功`)
    return undefined
  } catch (e) {
    console.log('图片下载错误', e)
    return e
  }
}

/**
 * Crawl one search-result page: collect the image URLs found in it, download
 * them one at a time, then follow the "next page" link until MAX_PAGES pages
 * have been visited or no such link is found.
 *
 * @param {string} url - URL of the result page to crawl.
 * @returns {Promise<void>} Settles when this page and all following pages
 *   have been processed; rejects if a result page cannot be fetched.
 */
async function spider(url) {
  console.log('开始收集图片', url)

  const { data: html } = await axios.get(url)
  const $ = cheerio.load(html)
  const urls = []
  $('#content .media_list .search_results img').each((index, item) => {
    const src = $(item).attr('src') || ''
    // When `src` is not an absolute https URL (presumably a lazy-load
    // placeholder), take the first candidate of `data-lazy-srcset` instead.
    const lazySrcset = $(item).attr('data-lazy-srcset') || ''
    const realSrc = src.startsWith('https') ? src : lazySrcset.split(' 1x,')[0]
    if (realSrc) {
      urls.push(realSrc)
    }
  })

  // Download sequentially so only one image request is in flight at a time.
  for (const imageUrl of urls) {
    await downloadImage(imageUrl)
  }

  // Next page
  if (page < MAX_PAGES) {
    const href = $('#content .media_list>a.pure-button').attr('href')
    if (!href) {
      // No "next page" link on this page: nothing more to crawl.
      return
    }
    page += 1
    await spider(new URL(href, baseUrl).href)
  }
}

spider(url).catch(e => {
  console.log('页面爬取错误', e)
  process.exitCode = 1
})
复制代码