-
-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
763 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
""" | ||
__author__ = 'AJay' | ||
__mtime__ = '2019/6/22 0022' | ||
""" | ||
import time | ||
from shortuuid import uuid | ||
import os | ||
#TODO:按照随机的文件名,导出每天的新闻到对应当天时间的文件夹中 | ||
''' | ||
输入一个路径、如果路径存在、则使用路径、如果路径不存在则使用文件的路径。输出文件路径所在的位置 | ||
导出当天的新闻 | ||
''' | ||
class EexportTxt():
    """Append exported titles to text files, starting a new file at a size limit.

    Titles are written in batches to a ``.txt`` file inside the save
    directory. Once the current file reaches ``file_size`` bytes, a new,
    randomly named file is started. A ``.txt`` file that is still below the
    limit when ``run`` starts is picked up again and appended to.
    """

    def __init__(self):
        # Directory containing this module; the fallback "titles" directory
        # is created underneath it.
        self.base_path = os.path.abspath(os.path.dirname(__file__))

        self.ds = 0  # size in bytes of the file currently being written
        self.length_p = 30
        self.file_size = 300 * 1024  # 300k

        # Populated by run(); created here so that calling save_size_txt()
        # directly does not fail with AttributeError.
        self.save_path = None
        self.save_file_name = None
        self.errMessage = None

    def _is_input_path(self, input_path):
        """Resolve the directory that exported files are written to.

        If ``input_path`` is an existing directory it is used as-is;
        otherwise a ``titles`` directory next to this module is used and
        created when missing.

        :param input_path: directory requested by the caller (may be empty or None)
        :return: None; the result is stored in ``self.save_path``
        """
        # isdir (not exists): a regular file cannot serve as the save directory.
        if input_path and os.path.isdir(input_path):
            self.save_path = input_path
            print('存在自定义路径')
            return
        self.save_path = os.path.join(self.base_path, 'titles')
        os.makedirs(self.save_path, exist_ok=True)

    def _is_less_file_size(self):
        """Find a ``.txt`` file under ``self.save_path`` smaller than ``file_size``.

        :return: path of the first matching file, or ``False`` when there is none
        """
        for root, _dirnames, file_names in os.walk(self.save_path):
            for file_name in file_names:
                # Only resume text exports; appending titles to an unrelated
                # file in a user-supplied directory would corrupt it.
                if not file_name.lower().endswith('.txt'):
                    continue
                candidate = os.path.join(root, file_name)
                if os.path.getsize(candidate) < self.file_size:
                    # file_size is stored in bytes; report it in kb.
                    print('小于{}kb'.format(self.file_size // 1024), file_name)
                    return candidate
        return False

    def _new_save_file(self):
        """Return a fresh, randomly named ``.txt`` path inside ``self.save_path``."""
        return os.path.join(self.save_path, str(time.time()) + uuid() + '.txt')

    def save_size_txt(self, title_set):
        """Write every title, one per line, switching files at the size limit.

        Titles are appended 100 at a time; the file size is re-read after
        each batch and a new file is started once it reaches ``file_size``.

        :param title_set: iterable of title strings
        :return: None
        """
        if self.save_path is None:
            # Not prepared by run(): fall back to the default directory.
            self._is_input_path(None)
        if not self.save_file_name:
            self.save_file_name = self._new_save_file()
        # Start from the real size of the file being resumed, not a stale value.
        if os.path.exists(self.save_file_name):
            self.ds = os.path.getsize(self.save_file_name)
        else:
            self.ds = 0

        title_list = [title + '\n' for title in title_set]
        while True:
            if self.ds < self.file_size:
                if not title_list:
                    print('导出完成')
                    print('没有最新的消息')
                    break
                with open(self.save_file_name, 'a+', encoding='gb18030') as f:
                    f.write(''.join(title_list[:100]))
                del title_list[:100]
            else:
                print('300k文件写完')
                if self.errMessage is not None:
                    self.errMessage.put("【导出文件】文件保存{}---{}kb".format(self.save_file_name, self.ds / 1000))
                self.ds = 0
                self.save_file_name = self._new_save_file()
                # Create the new file so that its size can be read below.
                with open(self.save_file_name, 'a+', encoding='gb18030') as f:
                    f.write('')
            self.ds = os.path.getsize(self.save_file_name)
            print('正在写入数据大小{}kb'.format(self.ds / 1000))

    def run(self, input_path, title_set, errMessage):
        """Export ``title_set`` into ``input_path`` (or the default directory).

        :param input_path: requested save directory
        :param title_set: iterable of title strings
        :param errMessage: object with a ``put`` method (e.g. ``queue.Queue``)
            that receives progress messages
        :return: None
        """
        self.errMessage = errMessage
        self._is_input_path(input_path)
        less_file = self._is_less_file_size()
        print('存在小文件', less_file)
        # Resume the partially filled file when there is one.
        self.save_file_name = less_file or None
        self.save_size_txt(title_set)
        # Report the file actually in use; less_file is False when no small
        # file existed beforehand.
        self.errMessage.put('标题导出完成:{}'.format(self.save_file_name))

    def check_input_path(self, input_path):
        """Return True when ``input_path`` is an existing directory, else False."""
        if input_path and os.path.isdir(input_path):
            print('路径正确')
            return True
        print('路径不存在或者不正确')
        return False
|
||
if __name__ == '__main__':
    from queue import Queue

    # Raw string: the Windows path contains backslashes that would otherwise
    # be parsed as (invalid) escape sequences.
    demo_path = r'F:\cache\新建文件夹'
    errorMessage = Queue()
    et = EexportTxt()
    et.check_input_path(demo_path)  # check whether the given path is valid
    et.run(input_path=demo_path, errMessage=errorMessage, title_set={'s', 'a'})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
泛目录解析程序 | ||
|
||
![](https://raw.githubusercontent.com/Hatcat123/GraphicBed/master/Img/20190625160323.gif) | ||
|
||
|
||
## 需求 | ||
|
||
给一个特定的网站域名,设置百度的蜘蛛爬虫,然后构造一个随机目录,如'isnsa/jasoh212.html'。收集返回结果的标题。对标题进行去重处理。结果保存在一个文件夹中,随机命名标题txt文件,每300kb保存一个文件。 | ||
|
||
做成界面。根据传入的值判断程序是否结束。 | ||
|
||
目标在短时间内尝试进行50w次扫描。 | ||
|
||
## 分析 | ||
|
||
界面效果 | ||
|
||
传入值:目标网站、线程数、爬取次数、睡眠时间、标题文件保存目录。 | ||
>目录长度,前目录长度,后目录长度大于4 | ||
输出:标题txt、界面标题展示、进度展示 | ||
|
||
功能:启动爬虫、暂停爬虫、重启爬虫、退出程序 | ||
|
||
## 实现 | ||
|
||
### 泛目录生成 | ||
|
||
使用随机random生成前目录与后目录字符,长度暂时固定,代码如下 | ||
``` | ||
import random | ||
import string | ||
def generate_random_str(randomlength=16): | ||
""" | ||
生成一个指定长度的随机字符串,其中 | ||
string.digits=0123456789 | ||
string.ascii_letters=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ | ||
""" | ||
str_list = [random.choice(string.digits + string.ascii_letters) for i in range(randomlength)] | ||
random_str = ''.join(str_list) | ||
return random_str | ||
print(generate_random_str(6)) | ||
print(generate_random_str(8)) | ||
``` | ||
|
||
### 爬虫功能 | ||
运用python requests库,设置百度蜘蛛爬虫UA访问目标网站,解析网站内容获取标题。在这个过程中遇到标题编码无法解析的问题,导致爬虫不能通用。 | ||
|
||
使用html库的`html.unescape`进行解析,将标题中的HTML字符实体(如`&#28595;`)转换成中文 | ||
``` | ||
print(html.unescape(title)) | ||
>澳门银河的网址400-2019动画片大全 | ||
>澳门银河的网址400-2019动画片大全 | ||
``` | ||
|
||
|
||
使用多线程,线程可设置成100左右。 | ||
|
||
多线程的方式参考`其他`项目中的基础 | ||
|
||
|
||
|
||
## 展示 | ||
|
||
|
||
打开exe后自动初始化项目 | ||
|
||
在配置中输入域名`http://richuriluo.qhdi.com/yl`,次数100,线程10,频率0,路径等。 | ||
|
||
点击`更新配置`后 | ||
|
||
点击开启采集,程序自动采集,上端进度条能展示当前任务的进度,点击暂停采集便能暂停爬虫,开启`继续采集`爬虫继续。 | ||
|
||
采集结束后,日志输出采集的成果,文件输出到保存的路径中。 | ||
|
||
![](https://raw.githubusercontent.com/Hatcat123/GraphicBed/master/Img/20190625160301.png) | ||
|
||
测试域名参数 | ||
|
||
http://www.taopic.com/poc | ||
|
||
http://www.kan300.com/doc | ||
|
||
http://www.kan300.com/hot |
Oops, something went wrong.