Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import contextlib
import datetime
import json
import os
import random
import re
import sys

import HTMLParser
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    """pyspider handler that walks a novel site category by category.

    Flow: ``on_start`` queues category list pages, ``list_Caterg`` queues the
    books on each list page, ``list_Caterg_detail`` reads book metadata and
    downloads the cover, ``index_page`` finds the first chapter, and
    ``detail_page`` stores each chapter (MySQL + HTTP publishing endpoint)
    and follows the "next chapter" link.
    """

    # P_dir and Datos are published as module-level globals (the class body
    # declares them ``global``), exactly as the original script did.
    global P_dir
    global Datos
    P_dir = '/Tools/Debug/'  # local root directory for downloaded cover images
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300,
    }

    # Timeout (seconds) for the direct ``requests`` calls made by this class;
    # without one a stalled server would block the worker forever.
    _http_timeout = crawl_config['timeout']

    # Connection settings shared by every insert helper (placeholders: fill in
    # the real user name, password and database name).
    _db_settings = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Site category slug -> category name used on the publishing side.
    _category_names = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the per-book metadata dict handed from callback to callback
    # through pyspider's ``save`` mechanism.
    _meta_keys = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # (needle, replacement) pairs applied in order to the chapter text.
    _content_swaps = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        # NOTE(review): str.replace treats this as literal text, not a regex.
        (r'^\\n\\n', r'<BR>'),
    )

    # (needle, replacement) pairs applied in order to the book introduction.
    _intro_swaps = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        # NOTE(review): literal needle; the replacement differs from the one
        # used for chapter text -- kept as in the original, confirm intent.
        (r'^\\n\\n', u'aghgfh.com'),
    )

    # Characters stripped from chapter text and introduction via re.sub.
    # NOTE(review): on Python 2 this is a byte-string pattern, so its
    # non-ASCII members may not match unicode text as intended -- confirm.
    _strip_pattern = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    def _run_insert(self, sql, params):
        """Run one parameterized INSERT on a fresh connection.

        Values are passed separately from the statement so the driver escapes
        them; scraped text containing quotes can no longer break (or inject
        into) the SQL. The connection is always closed.

        Returns ``cursor.lastrowid`` on success, ``None`` when the statement
        failed (the error is printed and the transaction rolled back).
        """
        with contextlib.closing(pymysql.connect(**self._db_settings)) as db:
            try:
                with contextlib.closing(db.cursor()) as cursor:
                    cursor.execute(sql, params)
                    row_id = cursor.lastrowid
                db.commit()
                return row_id
            except Exception as err:
                # Best effort, as in the original: report and keep crawling.
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()
        return None

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one row into the BookFile table (book and cover paths)."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._run_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one row into the BookTitle table (chapter index entry)."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._run_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date):
        """Insert one row into the BookConte table (full chapter record).

        Prints the new row id when the insert succeeded.
        """
        sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,Book_Introduction,'
               'Book_Synopsis,Book_Palabras,Bookurl,Booktitle,BookID,BookConte,Titleid,'
               'abover,Book_Date) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        row_id = self._run_insert(sql, (
            Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis,
            Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid,
            abover, Book_Date,
        ))
        if row_id is not None:
            print(row_id)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the HTTP publishing endpoint, GBK-encoded.

        Returns the ``requests`` response object.
        """
        if sys.version_info[0] == 2:
            # NOTE(review): process-wide default-encoding hack kept from the
            # original for Python 2 only; the name ``reload`` is not a builtin
            # on Python 3. Consider removing it once the prints are verified.
            reload(sys)
            sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # publishing endpoint URL
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore'),
        }
        res = requests.post(locoy_url, data=locoy_data, timeout=self._http_timeout)
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        """Set the site URL parts, the category slugs and the page range."""
        super(Handler, self).__init__()
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1
        self.total_num = 200

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue every category list page from ``page_num`` to ``total_num``.

        Loop state is kept in locals so the instance attributes are not
        consumed: a later scheduled run starts from the same first page
        instead of finding ``page_num`` already past ``total_num``.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                # Slugs without a mapping are published with an empty name.
                cater_name = self._category_names.get(cater_id, '')
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Queue the detail page of every book linked from a list page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    def list_Caterg_detail(self, response):
        """Read book metadata, download the cover and queue the catalogue.

        For each cover image found: the image is downloaded below ``P_dir``,
        a BookFile row is written, and the catalogue page is queued with the
        book metadata attached.
        """
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = response.doc('.booktitle p').text().split(' ')[1].split('|')[0]
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Cover download: <P_dir><name>/<name>.<extension>
            name = self.getname(img)
            file_name = name + "." + self.getExtension(img)
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional step: saves the cover image locally.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            meta = {
                "Cater_Name": Cater_Name,
                "Book_author": Book_author,
                "Book_Introduction": Book_Introduction,
                "Book_Synopsis": Book_Synopsis,
                "Book_Palabras": Book_Palabras,
                "img": img,
            }
            # Optional step: database record for other publishing systems.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)
            for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
                self.crawl(each.attr.href, callback=self.index_page, save=meta)

    def _copy_meta(self, save):
        """Return a fresh dict holding the book metadata keys from ``save``."""
        return {key: save[key] for key in self._meta_keys}

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter listed on a book's catalogue page."""
        meta = self._copy_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=meta)

    def _apply_swaps(self, text, swaps):
        """Apply each (needle, replacement) pair to ``text`` in order."""
        for needle, replacement in swaps:
            text = text.replace(needle, replacement)
        return text

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Store one chapter, publish it and queue the next chapter.

        Returns the chapter record as a dict (the pyspider task result).
        """
        save = response.save
        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = save['Cater_Name']  # category
        Book_author = save['Book_author']  # author
        Book_Synopsis = save['Book_Synopsis']  # latest update
        Book_Palabras = save['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        raw_content = response.doc('.article-con').text()  # chapter text
        abover = (response.doc('.article-title').text() + save['Book_Synopsis']
                  + save['Book_Palabras'] + save['Book_Introduction'])
        Book_Date = str(datetime.datetime.now())  # crawl time

        # Text clean-up: literal swaps, strip characters, then mark paragraphs.
        swapped = self._apply_swaps(raw_content, self._content_swaps)
        BookConte = re.sub(self._strip_pattern, u'', swapped).replace("\n\n", "<br>")
        print(BookConte)
        Book_Introduction = re.sub(
            self._strip_pattern, u'',
            self._apply_swaps(save['Book_Introduction'], self._intro_swaps))

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain string: the original had a trailing comma here that silently
        # turned the cover URL into a 1-tuple.
        Book_img = save['img']  # cover image URL

        # Optional steps: database records for other publishing systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID, BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional step: HTTP publishing; drop the call if not needed.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction, Book_Palabras, Book_img, Booktitle, BookConte, abover)

        # Follow the "next chapter" button with the untouched metadata.
        meta = self._copy_meta(save)
        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=meta)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch ``Book_img`` and write it to ``imgDir/file_name``.

        The image is fetched before the file is opened, so a failed request
        no longer leaves an empty file behind. ``P_dir`` is accepted for
        interface compatibility and is not used.

        Returns ``True`` when the file was written, ``False`` when the server
        answered with an error status. Network errors propagate.
        """
        reply = requests.get(Book_img, timeout=self._http_timeout)
        if not reply.ok:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as out:
            out.write(reply.content)
        return True

    def save_imgs(self, response):
        """Callback that writes a fetched image using paths from ``save``."""
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # NOTE(review): joined without a separator, unlike download() --
        # presumably imgDir is expected to end with "/"; confirm with callers.
        file_path = imgDir + file_name
        self.save_img(response.content, imgDir, file_path)

    def save_img(self, content, imgDir, path):
        """Write ``content`` to ``path``, creating ``imgDir`` if missing."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as out:
            out.write(content)

    def _last_segment(self, url):
        """Return the final path segment of ``url`` without query/fragment."""
        path = url.split("?", 1)[0].split("#", 1)[0]
        return path.split("/")[-1]

    def getExtension(self, url):
        """Return the file extension of the URL's last path segment.

        Query strings and fragments are ignored; a segment without a dot
        yields an empty string.
        """
        segment = self._last_segment(url)
        if "." not in segment:
            return ""
        return segment.split(".")[-1]

    def getname(self, url):
        """Return the last path segment of ``url`` up to its first dot."""
        return self._last_segment(url).split(".")[0]
复制代码