Spider.py
#!/usr/bin/python
# coding:utf-8
import os
import time
import codecs
import scrapy
from util import Utilty
from DeepExploit import Msgrpc
from scrapy.http import Request


class SimpleSpider(scrapy.Spider):
    name = 'simple_spider'

    def __init__(self, category=None, *args, **kwargs):
        super(SimpleSpider, self).__init__(*args, **kwargs)
        # Spider arguments (supplied with "-a key=value") are attached as
        # attributes by the base Spider, so read the crawl configuration
        # back with getattr().
        self.start_urls = getattr(self, 'target_url', None)
        self.allowed_domains = [getattr(self, 'allow_domain', None)]
        self.concurrent = int(getattr(self, 'concurrent', None))
        self.depth_limit = int(getattr(self, 'depth_limit', None))
        self.delay_time = float(getattr(self, 'delay', None))
        self.store_path = getattr(self, 'store_path', None)
        self.response_log = getattr(self, 'response_log', None)

        # Re-attach to the Metasploit RPC session opened by DeepExploit so
        # the spider can keep that session alive during the crawl.
        msgrpc_host = getattr(self, 'msgrpc_host', None)
        msgrpc_port = int(getattr(self, 'msgrpc_port', None))
        self.client = Msgrpc({'host': msgrpc_host, 'port': msgrpc_port})
        self.client.console_id = getattr(self, 'msgrpc_console_id', None).encode('utf-8')
        self.client.token = getattr(self, 'msgrpc_token', None).encode('utf-8')
        self.client.authenticated = True

        # Note: Scrapy normally honors custom_settings as a *class* attribute
        # read before instantiation, so values assigned here take effect only
        # if the caller applies them explicitly.
        self.custom_settings = {
            'CONCURRENT_REQUESTS': self.concurrent,
            'CONCURRENT_REQUESTS_PER_DOMAIN': self.concurrent,
            'DEPTH_LIMIT': self.depth_limit,
            'DOWNLOAD_DELAY': self.delay_time,
            'ROBOTSTXT_OBEY': True,
            'HTTPCACHE_ENABLED': True,
            'HTTPCACHE_EXPIRATION_SECS': 60 * 60 * 24,
            'HTTPCACHE_DIR': self.store_path,
            'FEED_EXPORT_ENCODING': 'utf-8'
        }

        # All response bodies are written to a single log file for later
        # analysis by DeepExploit.
        log_file = os.path.join(self.store_path, self.response_log)
        self.fout = codecs.open(log_file, 'w', encoding='utf-8')
        Utilty().print_message('ok', 'Save log to {}'.format(log_file))

    def start_requests(self):
        self.client.keep_alive()
        url = self.start_urls
        yield Request(url, self.parse)

    def parse(self, response):
        # Record the page, then follow every anchor href and script src.
        self.fout.write(response.body.decode('utf-8'))
        for href in response.css('a::attr(href)'):
            full_url = response.urljoin(href.extract())
            time.sleep(self.delay_time)
            yield scrapy.Request(full_url, callback=self.parse_item)
        for src in response.css('script::attr(src)'):
            full_url = response.urljoin(src.extract())
            time.sleep(self.delay_time)
            yield scrapy.Request(full_url, callback=self.parse_item)

    def parse_item(self, response):
        self.client.keep_alive()
        urls = []
        self.fout.write(response.body.decode('utf-8'))
        # Collect (but do not follow) the links found on second-level pages.
        for href in response.css('a::attr(href)'):
            full_url = response.urljoin(href.extract())
            urls.append(full_url)
        for src in response.css('script::attr(src)'):
            full_url = response.urljoin(src.extract())
            urls.append(full_url)
        yield {
            'urls': urls,
        }
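# ---------------------------------------------------------------------------
# Usage sketch (an assumption, not part of the original file): Scrapy passes
# "-a key=value" spider arguments as instance attributes, which is why
# __init__ above reads its configuration with getattr(self, ...). A
# standalone run would therefore look roughly like:
#
#   scrapy runspider Spider.py \
#       -a target_url=http://192.168.0.1/ \
#       -a allow_domain=192.168.0.1 \
#       -a concurrent=5 \
#       -a depth_limit=2 \
#       -a delay=1.0 \
#       -a store_path=/tmp/crawl \
#       -a response_log=response.log \
#       -a msgrpc_host=127.0.0.1 \
#       -a msgrpc_port=55553 \
#       -a msgrpc_token=<token> \
#       -a msgrpc_console_id=<console id>
#
# In practice DeepExploit launches this spider itself and supplies these
# values; the host, port, and paths above are illustrative placeholders.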