Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

- from pyspider.libs.base_handler import *. L) K2 }2 ^. R' `& _8 G
- import pymysql
+ A# A8 a1 s8 o9 `# v# s - import random0 C. S5 O+ _$ A# e. b
- import datetime: H) d! B. W, t: [* s. L
- import urllib2,HTMLParser,re
h4 c% G |/ z- t - import os
. T( B5 v$ r: ?7 C. Y- C6 S4 F/ r1 y - import sys
; ~, H, Z3 T. q4 p. M. L4 A - import re
! c2 c! s/ k8 ^6 h. q$ W3 k - import codecs# c. S& K5 n- X0 h9 [
- import requests6 l+ c Z# Z' F1 ]3 j$ r8 U4 v- ^
- import json) ]! I4 U4 V3 C: P2 \. {
-
class Handler(BaseHandler):
    """pyspider handler that scrapes a novel site chapter by chapter.

    Callback chain: ``on_start`` (category listing pages) ->
    ``list_Caterg`` (book links) -> ``list_Caterg_detail`` (book page,
    cover download) -> ``index_page`` (first chapter link) ->
    ``detail_page`` (chapter text; re-schedules itself on the "next
    chapter" link).  Each chapter is written to MySQL and POSTed to a
    Locoy publishing endpoint.

    NOTE(review): the file targets Python 2 (``reload(sys)``,
    ``sys.setdefaultencoding``); ``print`` is used in its single-argument
    call form so the syntax is valid on both Python 2 and 3.
    """

    # ``global`` inside the class body makes the two assignments below bind
    # module-level names, which the methods then read as plain globals.
    global Datos
    global P_dir
    P_dir = '/Tools/Debug/'  # Local directory under which cover images are saved.
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # Keys of the per-book metadata dict handed from callback to callback
    # through pyspider's ``save`` argument.
    _META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Source-site category slug -> category name used on the target site.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    def _insert_row(self, sql, params, verbose=False):
        """Execute one parameterised INSERT on a fresh connection.

        The values are passed separately from the statement so the driver
        escapes them; scraped text containing quotes can no longer break
        (or inject into) the SQL.  Errors are printed and rolled back, as
        before, and the connection is always closed.

        :param sql: statement using ``%s`` placeholders (no quoting).
        :param params: tuple of values, one per placeholder.
        :param verbose: when true, print the statement and the new row id.
        """
        db = pymysql.connect(host="localhost", user="数据库用户名", password="密码", db="数据库名", charset="utf8")
        try:
            cursor = db.cursor()
            if verbose:
                print(sql)
            cursor.execute(sql, params)
            if verbose:
                print(cursor.lastrowid)
            db.commit()
        except Exception as err:
            # Best-effort publishing: report the failure and keep crawling.
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
        finally:
            db.close()

    def _book_meta(self, save):
        """Return a fresh dict with the book metadata keys copied from *save*.

        Raises ``KeyError`` if *save* lacks one of the keys.
        """
        return {key: save[key] for key in self._META_KEYS}

    def _scrub(self, text, swaps, pattern):
        """Apply the literal replacements in order, then delete *pattern* matches."""
        for old, new in swaps:
            text = text.replace(old, new)
        return re.sub(pattern, u'', text)

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into ``BookFile`` (book name, id, cover URL, local cover path, date)."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert_row(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into ``BookTitle`` (book name, chapter title, book id, chapter id, date)."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._insert_row(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter row into ``BookConte`` and print the statement and row id."""
        sql = ('INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
               'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,'
               'abover,Book_Date) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
                  Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover,
                  Book_Date)
        self._insert_row(sql, params, verbose=True)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint and return the response.

        Text fields are GBK-encoded (unencodable characters dropped) before
        being sent as form data; the response text and raw content are printed.
        """
        # Python 2 only: switches the interpreter-wide default encoding to GBK.
        # NOTE(review): presumably needed so byte strings can be re-encoded
        # below -- confirm before porting to Python 3.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint.
        locoy_data = {
            'my_u': '用户名',  # Back-office user name.
            'my_p': '密码',  # Back-office password.
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        """Set the listing URL pieces, the category slugs and the page range."""
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1
        self.total_num = 200

    @every(minutes=8 * 60)
    def on_start(self):
        """Schedule every category listing page from ``page_num`` to ``total_num``.

        The page counter is a local loop variable: the original advanced
        ``self.page_num`` itself, so after the first run the periodic
        re-run found the counter exhausted and scheduled nothing.
        """
        # Kept as a module global, as in the original.
        global Cater_Name
        Cater_Name = []
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                # An unmapped slug keeps whatever name was set last.
                Cater_Name = self._CATEGORY_NAMES.get(cater_id, Cater_Name)
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Follow every absolute book link on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, download its cover, record it, and follow the catalogue link.

        If the page has no cover image the book is skipped with a message;
        the original raised ``NameError`` at that point.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # Book id taken from the URL.
        Book_Dates = str(datetime.datetime.now())

        img = None
        Locaimg = None
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Download the cover into <P_dir><name>/<name>.<extension>.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional step: these two lines may be commented out to skip
            # saving the image locally.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
        if img is None:
            print('no cover image found for ' + response.url)
            return

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing hook, handy for feeding other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """From the chapter catalogue, follow the link inside the first list item."""
        Datos = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it, schedule the next chapter, return the record.

        :returns: dict with the chapter's category, book, author, cleaned
            introduction and content, cover URL, page URL, title and ids.
        """
        # Literal (non-regex) replacements applied in order to the chapter text.
        content_swaps = (
            (u'哈书', u'静思'),
            (u'huhjsd.CC', u'aghgf.com'),
            (u'小说网', u'文学网'),
            (u'fgdfgf', u'aghgf.com'),
            (u'fgfgf', u'aghgf.com'),
            (r'^\\n\\n', r'<BR>'),
        )
        # Literal replacements applied in order to the book introduction.
        intro_swaps = (
            (u'哈书', u'静思'),
            (u'huhjsd.CC', u'aghgf.com'),
            (r'^\\n\\n', u'aghgfh.com'),
        )
        # Runs of these punctuation characters are deleted from both texts.
        # NOTE(review): on Python 2 this is a byte-string pattern containing
        # non-ASCII characters -- confirm it matches unicode input as intended.
        NewRe4 = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

        save = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # Book name.
        print(Bookname)
        Cater_Name = save['Cater_Name']  # Category.
        Book_author = save['Book_author']  # Author.
        Book_Synopsis = save['Book_Synopsis']  # Latest update.
        Book_Palabras = save['Book_Palabras']  # Word count.
        Bookurl = response.url  # Chapter URL.
        Booktitle = response.doc('.article-title').text()  # Chapter title.
        BookID = response.doc('.readset-r span').text()  # Book id.
        # Labelled "book status (serialising or finished)" in the original;
        # built by concatenating title, synopsis, word count and introduction.
        abover = (response.doc('.article-title').text() + save['Book_Synopsis']
                  + save['Book_Palabras'] + save['Book_Introduction'])
        Book_Date = str(datetime.datetime.now())  # Scrape time.

        raw_content = response.doc('.article-con').text()  # Chapter body.
        BookConte = self._scrub(raw_content, content_swaps, NewRe4).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = self._scrub(save['Book_Introduction'], intro_swaps, NewRe4)
        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Cover URL.  The original had a stray trailing comma here, which
        # turned this value into a 1-tuple.
        Book_img = save['img']

        # Optional: database publishing hooks, handy for feeding other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy publishing via HTTP POST; remove if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._book_meta(save)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch *Book_img* and write it to ``imgDir/file_name``.

        The image is fetched before the file is opened, so a failed request
        no longer leaves an empty file or an open handle behind.

        :param P_dir: unused; kept for interface compatibility.
        :returns: ``True`` when the file was written, ``False`` when the
            server answered with an error status.  (The original returned
            ``None``, so the caller's success branch never ran.)
        """
        imag = requests.get(Book_img)
        if not imag.ok:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as out:
            out.write(imag.content)
        return True

    def save_imgs(self, response):
        """Callback form of image saving: write ``response.content`` under ``save['imgDir']``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # os.path.join inserts the separator only when imgDir lacks one; the
        # original concatenated the two parts with no separator at all.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Create *imgDir* if needed and write *content* (bytes) to *path*."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def getExtension(self, url):
        """Return the text after the last ``.`` in *url*."""
        extension = url.split(".")[-1]
        return extension

    def getname(self, url):
        """Return the last path segment of *url* up to its first ``.``."""
        name = url.split("/")[-1].split(".")[0]
        return name
复制代码