Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

# The pyspider wildcard import stays first (as in the original script) so that
# any helper names it re-exports are shadowed by the explicit imports below.
from pyspider.libs.base_handler import *

import codecs
import datetime
import HTMLParser
import json
import os
import random
import re
import sys
import urllib2

import pymysql
import requests

class Handler(BaseHandler):
    """pyspider handler that crawls novels from a category listing site.

    Flow: ``on_start`` -> ``list_Caterg`` (category page) ->
    ``list_Caterg_detail`` (book page, cover download, BookFile insert) ->
    ``index_page`` (chapter catalogue) -> ``detail_page`` (chapter text,
    BookConte/BookTitle inserts, Locoy POST, then follows the next-chapter
    link).
    """

    # `global` inside the class body makes the two assignments below bind
    # module-level names rather than class attributes; the methods read
    # `P_dir` as a module global.
    global P_dir
    global Datos
    P_dir = '/Tools/Debug/'  # local directory where cover images are stored
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # Site category slug -> category name sent to the publishing target.
    _CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    def _run_insert(self, sql, params, echo=False):
        """Run one parameterized INSERT on a fresh connection.

        Args:
            sql: statement using ``%s`` placeholders (no quoting around them).
            params: tuple of values bound by the driver.
            echo: when true, print the rendered statement and the new row id.

        Errors raised while executing are printed and rolled back, not
        re-raised. A failure to connect still propagates to the caller.
        """
        db = pymysql.connect(host="localhost", user="数据库用户名", password="密码", db="数据库名", charset="utf8")
        try:
            cursor = db.cursor()
            if echo:
                print(cursor.mogrify(sql, params))
            # Values are bound by the driver, so quotes inside scraped text
            # can no longer break (or inject into) the statement.
            cursor.execute(sql, params)
            if echo:
                print(cursor.lastrowid)
            db.commit()
        except Exception as err:
            print("Error %s for execute sql: %s" % (err, sql))
            db.rollback()
        finally:
            # The connection used to be leaked on every call.
            db.close()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into BookFile (book name, id, cover URL, local cover path, date)."""
        sql = 'insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) values (%s,%s,%s,%s,%s)'
        self._run_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into BookTitle (book name, chapter title, book id, chapter id, date)."""
        sql = 'insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) values (%s,%s,%s,%s,%s)'
        self._run_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one chapter row into BookConte, printing the statement and the new row id."""
        sql = ' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,abover,Book_Date) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        params = (Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self._run_insert(sql, params, echo=True)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint as GBK form data.

        Returns:
            The ``requests`` response object of the POST.
        """
        # NOTE(review): this changes the interpreter-wide default encoding
        # (Python 2 only) on every call. Kept because removing it could alter
        # implicit str/unicode conversions elsewhere in the process.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        res = requests.post(locoy_url, data=locoy_data)
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1      # first listing page to crawl
        self.total_num = 200   # last listing page to crawl (inclusive)

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue every category listing page from ``page_num`` to ``total_num``.

        The page counter and category are local loop variables. The previous
        version advanced ``self.page_num`` permanently, so any later scheduled
        run on the same instance found nothing left to crawl.
        """
        # Still published as a module-level name, as the original did.
        global Cater_Name
        Cater_Name = []
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                # An unmapped slug keeps the previous value, as before.
                Cater_Name = self._CATEGORY_NAMES.get(cater_id, Cater_Name)
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=Cater_Name)

    def list_Caterg(self, response):
        """Follow every absolute book link on a category listing page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Scrape a book page, save its cover, record it, then open its catalogue."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        # The word count is the second space-separated token, up to the first
        # '|'. Fall back to '' instead of raising IndexError when the text has
        # no such token.
        title_parts = response.doc('.booktitle p').text().split(' ')
        Book_Palabras = title_parts[1].split('|')[0] if len(title_parts) > 1 else ''
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Defaults so a page without a cover image no longer raises NameError.
        img = ''
        Locaimg = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Download the book cover.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: these two lines store the cover locally.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)

        # NOTE(review): the original indentation was lost; the steps below are
        # assumed to run once per book, after the cover loop - confirm.
        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        # Optional: database publishing hook, convenient for other systems.
        self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    def _book_meta(self, save):
        """Return a fresh dict with the book-level fields carried between callbacks."""
        return {
            "Cater_Name": save['Cater_Name'],
            "Book_author": save['Book_author'],
            "Book_Introduction": save['Book_Introduction'],
            "Book_Synopsis": save['Book_Synopsis'],
            "Book_Palabras": save['Book_Palabras'],
            "img": save['img'],
        }

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Open the first chapter of the catalogue; later chapters are reached
        through the next-chapter link handled in ``detail_page``."""
        Datos = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Scrape one chapter, publish it, and queue the next chapter.

        Returns:
            dict with the book metadata and the cleaned chapter fields.
        """
        # Source-site markers and their replacements.
        NewRe1 = u'哈书'
        NewRe2 = u'huhjsd.CC'
        NewRe3 = r'^\\n\\n'  # used with str.replace, so matched literally
        NewRe5 = u'小说网'
        NewRe6 = u'fgdfgf'
        NewRe7 = u'fgfgf'
        NewRe4 = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'
        ReC1 = u'静思'
        ReC2 = u'aghgf.com'
        ReC3 = u'aghgfh.com'
        ReC4 = u''
        ReC5 = u'文学网'
        ReC6 = r'<BR>'

        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = response.save['Cater_Name']                 # category
        Book_author = response.save['Book_author']               # author
        Book_Introduction1 = response.save['Book_Introduction']  # introduction
        Book_Synopsis = response.save['Book_Synopsis']           # latest update
        Book_Palabras = response.save['Book_Palabras']           # word count
        Bookurl = response.url                                   # chapter URL
        Booktitle = response.doc('.article-title').text()        # chapter title
        BookID = response.doc('.readset-r span').text()          # book id
        BookConte1 = response.doc('.article-con').text()         # chapter text
        # Per the original comment this is the book status (serialised or
        # finished); it is built by concatenating four scraped fields.
        abover = response.doc('.article-title').text() + response.save['Book_Synopsis'] + response.save['Book_Palabras'] + response.save['Book_Introduction']
        Book_Date = str(datetime.datetime.now())                 # crawl time

        # Clean the chapter text: literal swaps, punctuation strip, then
        # paragraph breaks to <br>.
        cleaned = BookConte1
        for old, new in ((NewRe1, ReC1), (NewRe2, ReC2), (NewRe5, ReC5),
                         (NewRe6, ReC2), (NewRe7, ReC2), (NewRe3, ReC6)):
            cleaned = cleaned.replace(old, new)
        cleaned = re.sub(NewRe4, ReC4, cleaned)
        BookConte = cleaned.replace("\n\n", "<br>")
        print(BookConte)

        # Clean the introduction the same way (no <br> conversion).
        intro = Book_Introduction1
        for old, new in ((NewRe1, ReC1), (NewRe2, ReC2), (NewRe3, ReC3)):
            intro = intro.replace(old, new)
        Book_Introduction = re.sub(NewRe4, ReC4, intro)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # A stray trailing comma used to turn this into a 1-tuple.
        Book_img = response.save['img']  # cover image URL

        # Optional: store the chapter in MySQL (publishing hook for other systems).
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: publish through the Locoy POST endpoint.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        Datos = self._book_meta(response.save)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and store it as ``imgDir/file_name``.

        Args:
            P_dir: unused; kept so existing callers keep working.
            imgDir: target directory, created when missing.
            file_name: file name inside ``imgDir``.
            Book_img: URL of the image.

        Returns:
            True when the image was written, False when the server did not
            answer with HTTP 200 (nothing is written in that case). Network
            errors raised by ``requests`` still propagate.
        """
        # Request first, so a failed download no longer leaves an empty file.
        imag = requests.get(Book_img)
        if imag.status_code != 200:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        target = imgDir + "/" + file_name
        with open(target, 'wb') as handle:
            handle.write(imag.content)
        return True

    # Callback run before saving an image.
    def save_imgs(self, response):
        """Save the response body using the directory and name carried in ``response.save``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        file_path = imgDir + file_name
        self.save_img(content, imgDir, file_path)

    # Save an image.
    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` when missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as handle:
            handle.write(content)

    # Get the extension of a URL.
    def getExtension(self, url):
        """Return the text after the last '.' in ``url``."""
        extension = url.split(".")[-1]
        return extension

    # Get the image name.
    def getname(self, url):
        """Return the last path segment of ``url`` up to its first '.'."""
        name = url.split("/")[-1].split(".")[0]
        return name
复制代码
+ V, @; X/ ]4 N9 d& w
5 @8 \; k9 Q/ t* H% ~& z% u! i) m( ? |