Python + pyspider某小说站的爬虫，入数据库，火车头发布，资...

fei 发表于 2019-6-8 23:06:07

Python + pyspider某小说站的爬虫，入数据库，火车头发布，资源下载到本地，另可写爬虫！
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

from pyspider.libs.base_handler import *
import pymysql
import random
import datetime
import urllib2,HTMLParser,re
import os
import sys
import re
import codecs
import requests
import json

# pyspider project handler for the novel-site crawler (BaseHandler comes from
# pyspider.libs.base_handler).
# NOTE(review): every line after the class statement sits at column 0 in this
# copy, so the class-body indentation appears to have been lost; the statements
# and defs that follow were presumably meant to be indented inside Handler.
class Handler(BaseHandler):
# With these declarations the P_dir / Datos assignments below bind module-level
# names (assuming these lines are the class body); list_Caterg_detail reads
# P_dir as a plain global.
global Datos
global P_dir
P_dir = '/Tools/Debug/'# local directory where crawled cover images are saved
# NOTE(review): second, redundant declaration of Datos.
global Datos
Datos = {}
# HTTP request headers; handed to the fetcher through crawl_config below.
headers= {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate, sdch',
'Accept-Language':'zh-CN,zh;q=0.8',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
}
# Default fetch options: the headers above plus a timeout of 300
# (presumably seconds -- confirm against pyspider's crawl_config documentation).
crawl_config = {
   'headers' : headers,
   'timeout' : 300
}
def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
    """Insert one book/cover record into the ``BookFile`` table.

    Args:
        Bookname: book title.
        BookIDs: book identifier taken from the book URL.
        img: remote URL of the cover image.
        Locaimg: local path the cover was saved to.
        Book_Dates: crawl timestamp, already formatted as text.

    Database errors are printed and the transaction is rolled back; they
    are not re-raised, so a failed insert never aborts the crawl callback.
    """
    # Placeholders are left unquoted on purpose: the driver quotes and escapes
    # each value itself, so quotes inside scraped text cannot break (or
    # inject into) the statement.
    sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
           'values (%s,%s,%s,%s,%s)')
    # Containers (e.g. the list a str.split() call yields) are flattened to
    # their text form, as the former string-formatted query did; the driver
    # would otherwise expand them into a "(a,b)" SQL sequence.
    params = tuple(
        "%s" % (value,) if isinstance(value, (list, tuple, set, dict)) else value
        for value in (Bookname, BookIDs, img, Locaimg, Book_Dates)
    )
    db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8")
    try:
        cursor = db.cursor()
        cursor.execute(sql, params)
        db.commit()
    except Exception as err:
        print("Error %s for execute sql: %s" % (err, sql))
        db.rollback()
    finally:
        # One connection is opened per call; always give it back.
        db.close()
def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
    """Insert one chapter index row into the ``BookTitle`` table.

    Args:
        Bookname: book title.
        Booktitle: chapter title.
        BookID: book identifier.
        Titleid: chapter identifier.
        Book_Date: crawl timestamp, already formatted as text.

    Database errors are printed and the transaction is rolled back; they
    are not re-raised, so a failed insert never aborts the crawl callback.
    """
    # Unquoted placeholders: the driver quotes/escapes every value itself,
    # which keeps quotes in scraped titles from breaking the statement.
    sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
           'values (%s,%s,%s,%s,%s)')
    # Flatten containers to text (what the old string-formatted query stored)
    # so the driver does not expand them into a "(a,b)" SQL sequence.
    params = tuple(
        "%s" % (value,) if isinstance(value, (list, tuple, set, dict)) else value
        for value in (Bookname, Booktitle, BookID, Titleid, Book_Date)
    )
    db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8")
    try:
        cursor = db.cursor()
        cursor.execute(sql, params)
        db.commit()
    except Exception as err:
        print("Error %s for execute sql: %s" % (err, sql))
        db.rollback()
    finally:
        # One connection is opened per call; always give it back.
        db.close()
def add_question(self,Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date):
    """Insert one full chapter record into the ``BookConte`` table.

    The thirteen arguments map one-to-one, in order, onto the table columns
    of the same names (book metadata, chapter title/content, ids, the
    ``abover`` status text and the crawl timestamp).

    The statement template and the new row id are printed for debugging.
    Database errors are printed and the transaction is rolled back; they
    are not re-raised, so a failed insert never aborts the crawl callback.
    """
    # Unquoted placeholders: the driver quotes/escapes every value itself, so
    # quotes inside chapter text cannot break (or inject into) the statement.
    sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
           'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,'
           'abover,Book_Date) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
    # Flatten containers to text (what the old string-formatted query stored)
    # so the driver does not expand them into a "(a,b)" SQL sequence.
    params = tuple(
        "%s" % (value,) if isinstance(value, (list, tuple, set, dict)) else value
        for value in (Bookname, Cater_Name, Book_author, Book_Introduction,
                      Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                      BookConte, Titleid, abover, Book_Date)
    )
    db = pymysql.connect(host="localhost",user="数据库用户名",password="密码",db="数据库名",charset="utf8")
    try:
        cursor = db.cursor()
        print(sql)
        cursor.execute(sql, params)
        print(cursor.lastrowid)
        db.commit()
    except Exception as err:
        print("Error %s for execute sql: %s" % (err, sql))
        db.rollback()
    finally:
        # One connection is opened per call; always give it back.
        db.close()

def add_locoy(self,Bookname,Cater_Name,Book_author,Book_Introduction,Book_Palabras,Book_img,Booktitle,BookConte,abover):
    """POST one chapter to the Locoy-style publishing endpoint.

    Every text field is sent GBK-encoded (characters GBK cannot represent
    are dropped); ``Book_img`` is sent as given. The response text and raw
    content are printed, and the ``requests`` response object is returned.
    """
    # Python 2 only: reload(sys) re-exposes setdefaultencoding, which then
    # switches the process-wide default codec to GBK.
    reload(sys)
    sys.setdefaultencoding("gbk")

    def to_gbk(text):
        return text.encode('gbk', 'ignore')

    # Publishing interface address.
    endpoint = 'http://www.******.net/locoy/?my=book'
    form = {
        'my_u': '用户名',  # back-office user name
        'my_p': '密码',  # back-office password
        'subject_669977_net': to_gbk(Bookname),
        'caid': to_gbk(Cater_Name),
        'title_669977_net': to_gbk(Booktitle),
        'article': to_gbk(BookConte),
        'author': to_gbk(Book_author),
        'ready_1': to_gbk(Book_Palabras),
        'thumb': Book_img,
        'content': to_gbk(Book_Introduction),
        'abover': to_gbk(abover),
    }
    reply = requests.post(endpoint, data=form)
    print(reply.text)
    print(reply.content)
    return reply

def __init__(self):
    """Set the crawl seeds: URL pieces, category slugs and the page range."""
    # Site root and the separator placed between category slug and page number.
    self.base_url1, self.base_url2 = 'https://www.****.cc/', '/'
    # Slug currently being processed (starts as an empty list) and the full
    # list of category slugs to walk.
    self.CaterId = []
    self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
    # First and last listing page number.
    self.page_num, self.total_num = 1, 200

@every(minutes=8 * 60)
def on_start(self):
    """Queue every category listing page from ``page_num`` to ``total_num``.

    For each page number (outer loop) and each slug in ``self.CaterIds``
    (inner loop) the URL ``base_url1 + slug + base_url2 + page + "/"`` is
    queued with ``list_Caterg`` as callback; the category's display name
    travels along in ``save``.

    The page range is walked with a local counter. Previously the loop
    advanced ``self.page_num`` itself, so once a run finished the attribute
    stayed past ``total_num`` and any later run on the same instance queued
    nothing.
    """
    # Site slug -> category label used downstream. Several slugs share a
    # label; the mapping is reproduced exactly as it was.
    category_names = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }
    for page in range(self.page_num, self.total_num + 1):
        for cater_id in self.CaterIds:
            # Kept: the attribute has always mirrored the slug being queued.
            self.CaterId = cater_id
            print(cater_id)
            # An unmapped slug falls back to the slug itself instead of
            # silently reusing the previous iteration's label.
            label = category_names.get(cater_id, cater_id)
            url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
            self.crawl(url, callback=self.list_Caterg, save=label)

def list_Caterg(self, response):
    """Queue every book linked from a category listing page.

    Each ``.pic-list a`` link is crawled with ``list_Caterg_detail`` as
    callback, and the category label received in ``response.save`` is
    forwarded unchanged.
    """
    category = response.save
    book_links = response.doc('.pic-list a')
    for anchor in book_links.items():
        self.crawl(anchor.attr.href, callback=self.list_Caterg_detail, save=category)

def list_Caterg_detail(self, response):
    """Parse a book's landing page, store its cover and queue its chapter index.

    Reads title, author, introduction, latest-update text and word-count text
    from the page, downloads each ``.bigpic > img`` cover under ``P_dir``,
    records it through ``add_BookFile``, then crawls the chapter-index links
    with ``index_page`` as callback, forwarding the collected metadata in
    ``save``.

    Args:
        response: pyspider response whose ``save`` carries the category label.
    """
    Cater_Name = response.save
    Bookname = response.doc('h1').text()
    print(Bookname)
    Book_author = response.doc('.authorname > a').text()
    Book_Introduction = response.doc('.book-intro > div').text()
    Book_Synopsis = response.doc('b').eq(1).text()
    # The listing chained .split(' ').split('|'), which raises AttributeError
    # (a list has no split). Later stages concatenate and encode this value as
    # text, so one piece must be picked.
    # NOTE(review): taking the first piece of each split is an assumption --
    # confirm against the page markup.
    Book_Palabras = response.doc('.booktitle p').text().split(' ')[0].split('|')[0]
    # Book id: the path segment that follows "xiaoshuo/" (was left as a list).
    BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]
    Book_Dates = str(datetime.datetime.now())

    # Built before the cover loop so the chapter-index crawl below still has
    # its metadata when the page shows no cover image.
    Datos = {
        "Cater_Name": Cater_Name,
        "Book_author": Book_author,
        "Book_Introduction": Book_Introduction,
        "Book_Synopsis": Book_Synopsis,
        "Book_Palabras": Book_Palabras,
        "img": None,
    }
    for imgs in response.doc('.bigpic > img').items():
        img = imgs.attr.src
        if not img:
            # <img> without a src: nothing to download or record.
            continue
        print(img)
        # Cover download.
        extension = self.getExtension(img)
        name = self.getname(img)
        if isinstance(name, list):
            # Tolerate a getname() that hands back the dot-split pieces.
            name = name[0]
        file_name = name + "." + extension
        imgDir = P_dir + name
        Locaimg = imgDir + "/" + file_name
        print(Locaimg)
        # The next two lines may be commented out to skip saving covers locally.
        if self.download(P_dir, imgDir, file_name, img):
            print('attachment url is ' + img)
        Datos["img"] = img
        # May be commented out: database publishing hook for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
    # NOTE(review): 'diva' selects <diva> elements; if 'div a' (or another
    # selector) was intended, characters were lost in this copy -- confirm.
    for each in response.doc('diva').items():
        self.crawl(each.attr.href, callback=self.index_page, save=Datos)

@config(age=8 * 60 * 60)
def index_page(self, response):
    """Queue the first chapter listed on a book's chapter index.

    The six metadata fields received in ``response.save`` are copied into a
    fresh dict and forwarded to ``detail_page`` for every link matched by
    ``.chapter-list li:first-child a``.
    """
    carried_keys = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )
    metadata = {key: response.save[key] for key in carried_keys}
    for link in response.doc('.chapter-list li:first-child a').items():
        self.crawl(link.attr.href, callback=self.detail_page, save=metadata)
@config(priority=2)
@catch_status_code_error
def detail_page(self, response):
    """Scrape one chapter page, publish it and queue the following chapter.

    Chapter text and the book introduction are cleaned (source-site markers
    swapped, punctuation runs removed, blank lines turned into ``<br>`` in the
    chapter text). The record is then written through ``add_question`` and
    ``add_comment``, posted through ``add_locoy``, and the link matched by
    ``.articlebtn > a:nth-child(4)`` is crawled with this same callback.

    Changes from the listing:
      * ``Book_img`` is the saved cover URL itself; a trailing comma used to
        wrap it in a one-element tuple.
      * ``Titleid`` is the first path segment after the book id rather than
        the whole list of segments (NOTE(review): assumes the chapter id is
        that first segment -- confirm against the site's URLs).

    Returns:
        dict with the scraped fields, as the task result.
    """
    # Literal markers of the source site and their replacements.
    NewRe1 = u'哈书'
    NewRe2 = u'huhjsd.CC'
    # NOTE(review): used with str.replace below, so this is matched as literal
    # text (caret, backslashes, letter n), not as a regex for leading blank
    # lines -- it probably never matches; left untouched.
    NewRe3 = r'^\\n\\n'
    NewRe5 = u'小说网'
    NewRe6 = u'fgdfgf'
    NewRe7 = u'fgfgf'
    # Regex: runs of punctuation that are deleted from the text.
    NewRe4 = r'[\f\t\v+\.\{\（\）\}\!\/_,$%^*(+\"\')]+|[+——()?【】“”！，。？、~@#￥%……&*（）]+'
    ReC1 = u'静思'
    ReC2 = u'aghgf.com'
    ReC3 = u'aghgfh.com'
    ReC4 = u''
    ReC5 = u'文学网'
    ReC6 = r'<BR>'
    # (needle, replacement) pairs, applied in this order.
    content_swaps = (
        (NewRe1, ReC1), (NewRe2, ReC2), (NewRe5, ReC5),
        (NewRe6, ReC2), (NewRe7, ReC2), (NewRe3, ReC6),
    )
    # NOTE(review): NewRe3 -> ReC3 (a domain) looks like a copy/paste slip;
    # kept as it was.
    intro_swaps = ((NewRe1, ReC1), (NewRe2, ReC2), (NewRe3, ReC3))

    saved = response.save
    Bookname = response.doc('.readlocation a').eq(2).text()  # book title
    print(Bookname)
    Cater_Name = saved['Cater_Name']  # category
    Book_author = saved['Book_author']  # author
    Book_Synopsis = saved['Book_Synopsis']  # latest update
    Book_Palabras = saved['Book_Palabras']  # word count
    Bookurl = response.url  # chapter URL
    Booktitle = response.doc('.article-title').text()  # chapter title
    BookID = response.doc('.readset-r span').text()  # book id
    # Status text: chapter title + latest update + word count + raw intro.
    abover = Booktitle + saved['Book_Synopsis'] + saved['Book_Palabras'] + saved['Book_Introduction']
    Book_Date = str(datetime.datetime.now())  # crawl time

    BookConte = response.doc('.article-con').text()  # chapter body
    for needle, replacement in content_swaps:
        BookConte = BookConte.replace(needle, replacement)
    BookConte = re.sub(NewRe4, ReC4, BookConte)
    BookConte = BookConte.replace("\n\n", "<br>")
    print(BookConte)

    Book_Introduction = saved['Book_Introduction']  # book introduction
    for needle, replacement in intro_swaps:
        Book_Introduction = Book_Introduction.replace(needle, replacement)
    Book_Introduction = re.sub(NewRe4, ReC4, Book_Introduction)

    Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
    Book_img = saved['img']  # cover image URL

    # Store in MySQL. Either call may be commented out: database publishing
    # hooks for other systems.
    self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction,
                      Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                      BookConte, Titleid, abover, Book_Date)
    self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
    # Publish by HTTP POST; may be commented out when not needed.
    self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction,
                   Book_Palabras, Book_img, Booktitle, BookConte, abover)

    # The next chapter receives the same, uncleaned metadata this page got.
    carried_keys = ("Cater_Name", "Book_author", "Book_Introduction",
                    "Book_Synopsis", "Book_Palabras", "img")
    Datos = dict((key, saved[key]) for key in carried_keys)
    for each in response.doc('.articlebtn > a:nth-child(4)').items():
        self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
    return {
        "Cater_Name": Cater_Name,
        "Bookname": Bookname,
        "Book_author": Book_author,
        "Book_Introduction": Book_Introduction,
        "Book_Synopsis": Book_Synopsis,
        "Book_Palabras": Book_Palabras,
        "Book_img": Book_img,
        "Bookurl": response.url,
        "Booktitle": Booktitle,
        "BookID": BookID,
        "BookConte": BookConte,
        "Titleid": Titleid,
        "abover": abover,
    }
def download(self, P_dir, imgDir, file_name, Book_img):
    """Fetch ``Book_img`` and save it as ``imgDir + "/" + file_name``.

    Args:
        P_dir: base image directory (accepted for compatibility, not used).
        imgDir: directory to write into; created when missing.
        file_name: name of the file to create inside ``imgDir``.
        Book_img: URL of the image to fetch.

    Returns:
        True when the image was written, False when the server answered with
        an error status (nothing is written in that case). Network errors
        raised by ``requests`` propagate to the caller.
    """
    # Fetch first, so a failed request no longer leaves an empty file behind.
    imag = requests.get(Book_img)
    if not imag.ok:
        return False
    if not os.path.exists(imgDir):
        os.makedirs(imgDir)
    target = imgDir + "/" + file_name
    # The with-block closes the handle even if the write fails.
    with open(target, 'wb') as out:
        out.write(imag.content)
    return True
   #保存图片前
def save_imgs(self,response):
    """Callback that stores a fetched image body on disk.

    Reads ``file_name`` and ``imgDir`` from ``response.save``, joins them by
    plain concatenation (no separator is inserted) and hands the body to
    ``save_img``.
    """
    body = response.content
    target_name = response.save["file_name"]
    target_dir = response.save["imgDir"]
    self.save_img(body, target_dir, target_dir + target_name)
# Save an image to disk.
def save_img(self,content,imgDir,path):
    """Write ``content`` (bytes) to ``path``, creating ``imgDir`` when missing.

    Args:
        content: raw bytes to store.
        imgDir: directory that must exist before writing.
        path: full path of the file to (over)write.
    """
    if not os.path.exists(imgDir):
        os.makedirs(imgDir)
    # The with-block closes the handle even if the write fails.
    with open(path, "wb") as out:
        out.write(content)
# Get the suffix (file extension) of a URL.
def getExtension(self,url):
    """Return the text after the last "." in ``url``.

    A URL without any dot is returned whole.
    """
    return url.rpartition(".")[2]

# Get the image's base name.
def getname(self,url):
    """Return the last path segment of ``url`` up to its first ".".

    The listing returned the whole list of dot-separated pieces, which the
    caller then tried to concatenate with strings; the name is now returned
    as text.
    """
    return url.split("/")[-1].split(".")[0]

页: [1]

飞逸社区's Archiver

Python + pyspider某小说站的爬虫，入数据库，火车头发布，资...