This repository has been archived by the owner on Apr 13, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 63
/
decaptcha.py
121 lines (114 loc) · 5.24 KB
/
decaptcha.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# __author__ = 'Yaoshi'
# -*- coding: utf-8 -*-
# This file downloads CAPTCHA images and splits each one into single letters or digits.
from __future__ import division
import time
import urllib2
import socks
from sockshandler import SocksiPyHandler
from PIL import Image, ImageEnhance, ImageFilter, ImageGrab
class Decaptcha:
    """Download CAPTCHA images and cut each one into single-character samples.

    Constructing an instance runs the whole pipeline (see ``__init__``).

    Files are written into the relative directories ``images/``,
    ``bk-images/``, ``filter-images/``, ``gray-images/`` and
    ``trainingdigit/``.  This class does not create them, so they must
    already exist in the working directory.

    Single-argument ``print(...)`` calls are used on purpose: they behave the
    same under the Python 2 print statement this file targets and under the
    Python 3 print function.
    """

    # Gray levels strictly below this value are treated as ink.
    INK_THRESHOLD = 200
    # A run of inked columns is kept only when its width w satisfies
    # MIN_LETTER_COLS <= w < MAX_LETTER_COLS.
    MIN_LETTER_COLS = 5
    MAX_LETTER_COLS = 14
    # Divisor used for the success ratio; presumably the number of
    # characters in each CAPTCHA -- TODO confirm.
    LETTERS_PER_CAPTCHA = 4

    def __init__(self, new_img_id, counter, number):
        """Fetch and segment CAPTCHAs numbered ``counter`` .. ``number - 1``.

        :param new_img_id: id given to the first character sample written;
            it grows by one for every sample saved.
        :param counter: index of the first CAPTCHA to process.
        :param number: stop once ``counter`` reaches this value.
        """
        while counter < number:
            print('Now processing pic no %d' % counter)
            tmp_file_name = self.crawler(counter)
            next_img_id = self.img2binary(img=tmp_file_name,
                                          new_img_id=new_img_id,
                                          counter=counter)
            # The original passed the count as a second print argument, so
            # the "%d" placeholder was printed literally.
            print('Get %d alphabet or number' % (next_img_id - new_img_id))
            new_img_id = next_img_id
            counter += 1
            time.sleep(1)  # pause between two downloads

        expected = counter * self.LETTERS_PER_CAPTCHA
        if expected:
            # Relies on the file-level ``from __future__ import division``
            # for a fractional result under Python 2.
            print('Cut out success ratio: %s' % (new_img_id / expected))
        else:
            # Nothing to divide by; the original raised ZeroDivisionError.
            print('Cut out success ratio: n/a (no CAPTCHA processed)')

    def crawler(self, counter):
        """Download one CAPTCHA through a SOCKS5 proxy and store it on disk.

        :param counter: index used to build the output file name.
        :return: path of the written file, ``images/MJ<counter>.jpg``.
        """
        # Proxy host and target URL are blank in this file; presumably they
        # have to be filled in before the crawler can fetch anything.
        ip = ''
        port = 1080
        url = ''
        filename = "images/MJ%d.jpg" % counter

        opener = urllib2.build_opener(SocksiPyHandler(socks.SOCKS5, ip, port))
        opener.addheaders = [('User-Agent',
                              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36')]
        response = opener.open(url)
        try:
            image_bytes = response.read()
        finally:
            response.close()

        # Binary mode: the payload is image data, and text mode would
        # translate line endings on Windows and corrupt it.
        with open(filename, 'wb') as out:
            out.write(image_bytes)
        return filename

    def add_background(self, image, counter):
        """Flatten an image onto a white canvas and save the result.

        :param image: path (or file object) accepted by ``Image.open``.
        :param counter: index used in ``bk-images/MJbackground-<counter>.png``.
        :return: the composited RGBA image.
        """
        # The image is used as its own paste mask, which needs an alpha
        # band.  Converting first keeps alpha when present and makes the
        # paste work for images without one (e.g. plain RGB JPEGs).
        img = Image.open(image).convert('RGBA')
        canvas = Image.new('RGBA', img.size, (255, 255, 255))
        canvas.paste(img, (0, 0), img)
        canvas.save("bk-images/MJbackground-%d.png" % counter, "PNG")
        return canvas

    def img2binary(self, img, new_img_id, counter):
        """Clean up one CAPTCHA, binarize it and save each character found.

        Intermediate images go to ``filter-images/<counter>.png`` and
        ``gray-images/<counter>.png``.  Each accepted character is written as
        ``trainingdigit/demo<id>.txt`` and ``trainingdigit/letter_<id>.png``.

        :param img: path of the downloaded CAPTCHA.
        :param new_img_id: id for the first character written by this call.
        :param counter: index of the CAPTCHA, used in intermediate file names.
        :return: ``new_img_id`` plus the number of characters saved.
        """
        img = self.add_background(image=img, counter=counter)
        img = img.filter(ImageFilter.MedianFilter(3))  # median filter to de-noise
        img.save("filter-images/%d.png" % counter, "PNG")

        # Boost contrast, sharpness and brightness, each by a factor of 2.
        for enhancer_cls in (ImageEnhance.Contrast,
                             ImageEnhance.Sharpness,
                             ImageEnhance.Brightness):
            img = enhancer_cls(img).enhance(2)

        img = img.convert("L")  # gray-scale
        img.save("gray-images/%d.png" % counter, "PNG")

        column_counts = self._binarize(img)
        for columns in self._group_columns(column_counts):
            if self.MIN_LETTER_COLS <= len(columns) < self.MAX_LETTER_COLS:
                self._save_letter(img, columns, new_img_id)
                new_img_id += 1
        return new_img_id

    def _binarize(self, img):
        """Threshold a mode "L" image in place; return ink count per column."""
        width, height = img.size
        pixdata = img.load()
        column_counts = []
        for x in range(width):
            ink = 0
            for y in range(height):
                if pixdata[x, y] < self.INK_THRESHOLD:
                    pixdata[x, y] = 0      # ink (valid) pixel -> black
                    ink += 1
                else:
                    pixdata[x, y] = 255    # background pixel -> white
            column_counts.append(ink)
        return column_counts

    @staticmethod
    def _group_columns(column_counts):
        """Group consecutive columns that contain ink into lists of indices.

        Columns with a zero count separate the groups.  A group touching the
        right edge is closed explicitly; the original loop indexed past the
        end of the list in that case.
        """
        groups = []
        current = []
        for col, count in enumerate(column_counts):
            if count:
                current.append(col)
            elif current:
                groups.append(current)
                current = []
        if current:
            groups.append(current)
        return groups

    @staticmethod
    def _save_letter(img, columns, img_id):
        """Write one character as a 0/1 text matrix and as a PNG crop.

        ``columns`` must be consecutive x indices of the binarized ``img``.
        """
        height = img.size[1]
        pixdata = img.load()
        # One text line per pixel row: "1" for ink (0), "0" for background.
        lines = [
            "".join("1" if pixdata[x, y] == 0 else "0" for x in columns) + "\n"
            for y in range(height)
        ]
        with open("trainingdigit/demo%d.txt" % img_id, 'w') as out:
            out.writelines(lines)

        # The columns are contiguous, so the character is a plain crop.
        letter = img.crop((columns[0], 0, columns[-1] + 1, height))
        letter.save("trainingdigit/letter_%d.png" % img_id, "PNG")
if __name__ == '__main__':
    # Script entry point: building the object starts at picture 0 with
    # sample id 0 and stops once the picture counter reaches 400.
    handle_class = Decaptcha(
        new_img_id=0,
        counter=0,
        number=400,
    )