【Ruby爬虫01】某吃瓜网站图片数据采集

介绍

最近在学习 Ruby，于是写了一个爬虫来练手，涉及 XML 解析、多线程、XPath 语法等基础知识。

实现代码

使用说明

使用前请先安装如下gem

gem install  nokogiri http openssl

# nokogiri：一个解析 XML 和 HTML 的库，支持 CSS 选择器和 XPath 语法
# http：一个发送 HTTP 请求的库
# openssl：用于 AES 解密，Ruby 已自带该库，通常无需单独安装

源代码

require 'fileutils'
require 'openssl'
require 'thread'
require 'time'

require 'http'
require 'nokogiri'

# The site hosts objectionable content, so its address is stored here in an
# encoded form; working out how to decode it is left to the reader.
# Frozen so the shared constant cannot be mutated in place.
BASE_URL = 'l5VKR[9`aI10.P;m*LzIh,]@P17&0^F'.freeze
  
  
# Decrypts a binary string with AES-128-CBC. The images served by the site are
# encrypted and have to pass through this before they are saved.
#
# Padding handling is switched off, so the output is returned exactly as
# decrypted, with no trailing bytes stripped.
#
# @param encrypted_data [String] raw ciphertext bytes
# @param key [String] 16-byte AES key (the default is presumably the site's key)
# @param iv [String] 16-byte initialisation vector (default presumably the site's)
# @return [String] the decrypted bytes
def aes_128_cbc_decrypt(encrypted_data, key = 'f5d965df75336270', iv = '97b60394abc2fbe1')
  cipher = OpenSSL::Cipher.new('aes-128-cbc').tap do |c|
    # The cipher must be put into decrypt mode before key and IV are assigned.
    c.decrypt
    c.key = key
    c.iv = iv
    c.padding = 0 # disable padding
  end

  plaintext = cipher.update(encrypted_data)
  plaintext << cipher.final
end
  
  
# Downloads a page and parses it into a Nokogiri HTML document.
#
# Redirects are followed automatically. Any StandardError raised while
# fetching or parsing is printed and swallowed, in which case nil is returned,
# so callers have to check the result.
#
# @param page_url [String] URL of the page to download
# @return [Object, nil] the parsed document, or nil when the request failed
def get_page_doc(page_url)
  # HTTP.follow makes the client follow redirects automatically.
  resp = HTTP.follow.get(page_url)
  Nokogiri::HTML(resp.body.to_s)
rescue StandardError => e
  # Deliberately not `rescue Exception`: Interrupt (Ctrl-C) and SystemExit
  # must propagate so that the crawl can actually be stopped.
  puts e.message
  nil
end
  
# Builds the list of article pages found on one listing page.
#
# Article links and post cards are read with two separate XPath queries and
# paired by position. A link that has no matching card, or a card that lacks a
# title or timestamp node, gets empty strings for the missing fields instead
# of raising NoMethodError on nil.
#
# @param doc [#xpath] parsed listing page
# @return [Array<Hash>] one hash per article link with the string keys
#   'url', 'title' and 'publish_time'
def fetch_list_urls(doc)
  # Article hrefs are appended to the site root.
  urls = doc.xpath('//*[@id="archive"]/article/a/@href').map { |link| BASE_URL + link.content }
  # Title and publish time of every post card, in document order.
  infos = doc.xpath('//*[@class="post-card"]/div[2]/div').map { |card| extract_post_info(card) }

  urls.each_with_index.map do |url, index|
    info = infos[index] || { 'title' => '', 'publish_time' => '' }
    { 'url' => url, 'title' => info['title'], 'publish_time' => info['publish_time'] }
  end
end

# Reads title and publish date (formatted as YYYY/MM/DD) from one post card.
def extract_post_info(card)
  # Cards without any visible text are advertisements.
  return { 'title' => '', 'publish_time' => '' } if card.content.gsub(/\s+/, '') == ''

  title_node = card.xpath('h2[@class="post-card-title"]/text()').first
  time_node = card.xpath('div[@class="post-card-info"]/span[2]/@content').first
  {
    'title' => title_node ? title_node.content : '',
    'publish_time' => time_node ? Time.parse(time_node.content).strftime('%Y/%m/%d') : ''
  }
end
  
  
# Downloads every encrypted image of one article page, decrypts it and stores
# it as "images/<sanitized title>/image<N>.jpg" next to the running script,
# where N is the 1-based position of the image on the page.
#
# @param title [String] article title, used (after sanitizing) as directory name
# @param page_url [String] URL of the article page
# @param max_threads [Integer] upper bound on concurrent download threads
# @return [Array<Thread>, nil] the finished worker threads, or nil when the
#   page could not be fetched
def fetch_page(title, page_url, max_threads: 8)
  doc = get_page_doc(page_url)
  return if doc.nil?

  # Strip characters that would make directory creation fail on Windows.
  title = title.gsub(/[“”：、\-*<>?\|\/？!！\s]*/, '')
  target_dir = File.join(File.dirname($0), "images/#{title}")
  # mkdir_p also creates the parent "images" directory and is a no-op when
  # the directory already exists.
  FileUtils.mkdir_p(target_dir)

  image_urls = doc.xpath('//*[@itemprop="articleBody"]/p/img/@data-xkrkllgl').map(&:content)

  # Each job carries the image's own number so that a worker handling several
  # images never overwrites a file it wrote earlier.
  work_queue = Queue.new
  image_urls.each_with_index { |img_url, index| work_queue << [img_url, index + 1] }
  # A closed queue makes #pop return nil once it is drained, ending the workers.
  work_queue.close

  worker_count = [image_urls.size, [max_threads, 1].max].min
  workers = Array.new(worker_count) do
    Thread.new do
      while (job = work_queue.pop)
        download_and_save_image(job[0], job[1], target_dir)
      end
    end
  end
  workers.each(&:join)
end

# Fetches one encrypted image, decrypts it and writes it into target_dir.
# Failures are logged and skipped so one bad image does not stop the others.
def download_and_save_image(img_url, index, target_dir)
  p "下载图片：#{img_url}"
  # Read the image data with a 3 second timeout.
  raw_data = HTTP.timeout(3).get(img_url).body.to_s
  sleep 0.1
  File.binwrite("#{target_dir}/image#{index}.jpg", aes_128_cbc_decrypt(raw_data))
rescue StandardError => e
  p e.message
end
  
  
# Crawls the category's listing pages one after another, downloading the
# images of every article found, until a page has no "下一页" (next page) link.
#
# A failing page is logged and skipped. The crawl gives up once
# max_consecutive_failures pages in a row have failed, so an unreachable site
# or running past the last page cannot loop forever.
#
# @param max_consecutive_failures [Integer] failures in a row that end the crawl
# @return [nil]
def start_crawl(max_consecutive_failures: 5)
  page_index = 1
  failures = 0
  loop do
    begin
      has_next = crawl_list_page(page_index)
      failures = 0
      break unless has_next

      # Short pause before requesting the next listing page.
      sleep 0.1
    rescue StandardError => e
      # StandardError only: Interrupt (Ctrl-C) must be able to stop the crawl.
      p e.message
      failures += 1
      break if failures >= max_consecutive_failures
    end
    page_index += 1
  end
end

# Crawls one listing page and reports whether a "next page" link is present.
def crawl_list_page(page_index)
  url = "#{BASE_URL}category/wpcz/#{page_index}/" # category listing page
  p "正在抓取#{page_index}页，地址：#{url}"
  doc = get_page_doc(url)
  raise "页面获取失败：#{url}" if doc.nil?

  fetch_list_urls(doc).each do |page|
    fetch_page(page['title'], page['url'])
  end

  # The last page has no "next" button at all, so the node may be missing.
  next_page_xpath = '//*[@class="page-navigator"]/ol/li[@class="btn btn-primary next"]/a/text()'
  next_link = doc.xpath(next_page_xpath).first
  !next_link.nil? && next_link.content == "下一页"
end
  
# Start the crawl only when this file is run directly as a script.
start_crawl if __FILE__ == $PROGRAM_NAME