Coding/Python Matlab

파이썬 - 웹스크래핑 연습 예제 (iMacro 함께 사용)

smores 2012. 1. 22. 15:46
# -*- coding: utf-8 -*-

from twill.commands import *
import re
import Image

def _list_page_path(page):
    """Return the local file name under which list page *page* is stored."""
    return 'c:\\z\\nakjang-' + str(page) + '.htm'


def _collect_episode_links(pages):
    """Parse the saved list pages and return a list of ``[chapter, link]``.

    Each saved list page is searched for anchors whose text is the comic
    title followed by an episode number.  The link and the episode number
    are taken from regex capture groups, so the result does not depend on
    how many bytes the Korean characters occupy in the page encoding.
    """
    # NOTE(review): on Python 2 the Korean literal is compared byte-for-byte,
    # so this source file presumably has to be saved in the same encoding as
    # the downloaded pages -- confirm.
    pattern = re.compile(
        r'''href="([-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~'"@]+)">낙장불입 ([0-9]+)화''')
    pglink = []
    for page in pages:
        with open(_list_page_path(page)) as f:
            html = f.read()
        for link, chap in pattern.findall(html):
            pglink.append([chap, link])
    return pglink


def _write_imacro_script(path, userid, snum1, snum2, pglink):
    """Write an iMacro script that logs in and saves every page as a PNG.

    *pglink* is a sequence of ``[chapter, link]`` pairs; for each pair the
    script navigates to the link and saves a PNG named after the chapter.
    """
    header = [
        'VERSION BUILD=7401110 RECORDER=FX',
        'TAB T=1',
        'URL GOTO=about:home',
        'URL GOTO=https://kr.edit.sdb.yahoo.com/verify_adult?.done=http%3A%2F%2Fkr.news.yahoo.com%2Fservice%2Fcartoon%2Fshellview2.htm%3Flinkid%3Dseries_cartoon%26sidx%3D10617%26widx%3D106%26page%3D2%26seq%3D%26wdate%3D20100406%26wtitle%3D%25B3%25AB%25C0%25E5%25BA%25D2%25C0%25D4',
        'TAG POS=1 TYPE=INPUT:TEXT FORM=NAME:pgForm ATTR=ID:username CONTENT=' + userid,
        'TAG POS=1 TYPE=INPUT:TEXT FORM=NAME:pgForm ATTR=ID:snumber1 CONTENT=' + snum1,
        # Encryption is switched off, so the next value is stored in the
        # script file as plain text.
        'SET !ENCRYPTION NO',
        'TAG POS=1 TYPE=INPUT:PASSWORD FORM=NAME:pgForm ATTR=ID:snumber2 CONTENT=' + snum2,
        'TAG POS=1 TYPE=INPUT:BUTTON FORM=ID:pgForm ATTR=ID:confirmBtn',
        'WAIT SECONDS=5',
    ]
    with open(path, 'w') as f:
        f.write('\n'.join(header) + '\n')
        for chap, link in pglink:
            f.write("URL GOTO=" + link + '\n')
            # Raw string: the backslashes are literal path separators.
            f.write(r"SAVEAS TYPE=PNG FOLDER=C:\Z\ FILE=" + chap + '\n')
            f.write("WAIT SECONDS=3\n\n")


def NakJang():
    """Scrape the episode list of the comic and emit an iMacro script.

    Steps performed:

    1. Submit the ``verify_adult`` form through twill using the hard-coded
       credentials below.
    2. Download list pages 1-12 and save them as ``c:\\z\\nakjang-N.htm``.
    3. Collect ``[chapter, link]`` pairs from the saved pages.
    4. Write ``nakjang.iim`` (in the current directory), an iMacro script
       that logs in and saves each episode page as a PNG in ``C:\\Z\\``.

    Returns None; the results are the files written to disk.
    """
    # Placeholders: replace with the real user name and the two halves of
    # the resident registration number before running.
    userid = 'yourname'
    snum1 = 'yourjuminbunho-1'
    snum2 = 'yourjuminbunho-2'

    # Comic title: "낙장불입"
    url = 'https://kr.edit.sdb.yahoo.com/verify_adult?.done=http%3A%2F%2Fkr.news.yahoo.com%2Fservice%2Fcartoon%2Fshellview2.htm%3Flinkid%3Dseries_cartoon%26sidx%3D12087%26widx%3D106%26page%3D1%26seq%3D0%26wdate%3D20100406%26wtitle%3D%25B3%25AB%25C0%25E5%25BA%25D2%25C0%25D4'

    go(url)
    showforms()

    formvalue(1, 'username', userid)
    formvalue(1, 'snumber1', snum1)
    formvalue(1, 'snumber2', snum2)
    submit()

    # Save the complete episode list, pages 1-12 (lower page number = newer).
    pages = range(1, 13)
    url = 'http://kr.news.yahoo.com/service/cartoon/shelllist.htm?linkid=toon_series&work_idx=106&page='
    for page in pages:
        go(url + str(page))
        save_html(_list_page_path(page))

    # Collect the links for all 222 episodes.
    pglink = _collect_episode_links(pages)

    # Generate the iMacro script that saves the web image of all 222
    # episodes.  Browser used: English Firefox 8.0.
    _write_imacro_script('nakjang.iim', userid, snum1, snum2, pglink)


def ImageCutAndSave():
    """Crop ``1.png`` .. ``222.png`` and save them as ``N-001.PNG`` etc.

    For every image in the current directory the crop rectangle is found
    by probing pixels:

    * left / right: scan row 500 inwards from x=50 and x=900 for the first
      pixel whose first channel is 0, then widen by 30 pixels
      (presumably the dark frame of the comic -- confirm);
    * top: fixed at y=250;
    * bottom: scan column 490 upwards from 10 pixels above the bottom edge
      for a pixel equal to (254, 0, 0), move 230 pixels above it, then keep
      scanning upwards for a pixel whose first three channels are all
      below 10 and stop 40 pixels below that one.

    When a probe finds nothing, the starting value of that edge is kept.
    Each output file name is printed as it is saved.
    """
    top = 250
    for number in range(1, 223):
        picture = Image.open(str(number) + '.png')
        height = picture.size[1]
        pixels = picture.load()

        # Horizontal extent, probed along row 500.
        left = next(
            (x - 30 for x in range(50, 900) if pixels[x, 500][0] == 0),
            0)
        right = next(
            (x + 30 for x in range(900, 50, -1) if pixels[x, 500][0] == 0),
            900)

        # Vertical extent, probed along column 490 from the bottom up.
        bottom = next(
            (y - 230 for y in range(height - 10, 1, -1)
             if pixels[490, y][0:3] == (254, 0, 0)),
            height - 10)
        for y in range(bottom, 1, -1):
            pixel = pixels[490, y]
            if not (pixel[0] >= 10 or pixel[1] >= 10 or pixel[2] >= 10):
                bottom = y + 40
                break

        cropped = picture.crop((left, top, right, bottom))
        filename = "N-%03d.PNG" % number
        print(filename)
        cropped.save(filename)



# Script entry point: only the scraping / iMacro-generation step runs;
# the image-cropping step is left commented out.
if __name__ == "__main__":
    NakJang()
    # ImageCutAndSave()


'Coding > Python Matlab' 카테고리의 다른 글

파이썬 - lambda  (0) 2012.01.27
파이썬 - how to sort a Python dict  (0) 2012.01.26
파이썬 - PIL (Python Image Library)  (0) 2012.01.22
파이썬 - twill을 이용한 web login & data scraping  (0) 2012.01.21
파이썬 - eval()  (0) 2012.01.20