分享一个最早接触 Python 时写的图片爬虫程序:根据关键字从 Flickr 上抓取图片。具体流程看代码很容易理解。不过这个程序目前只能抓取第一页的图片,第二页的图片抓取不到,因为 Flickr 上的分页是通过 Ajax 来做的;如果想按同一关键字从 Flickr 上抓取大量图片,使用 Flickr 提供的 Python API 接口即可。
下面给出程序: 首先是一个 imglist.txt 文件,里面每一行放一个关键字,例如第一行是"北京天安门",第二行是"北京故宫"。
然后是爬虫程序:
.. code:: python
#coding=utf-8
'''
author:the5fire http://www.the5fire.com
'''
import urllib2
import urllib
import MySQLdb
import sys
import re
import os
import codecs
# 1-based index of the next image file to write. The driver script resets it
# to 1 before each keyword; getPicsByKeyword advances it after every
# successful download.
COUNT = 1


def _read_url(url):
    """Return the body of *url*, or None if the request fails with IOError."""
    try:
        response = urllib2.urlopen(url)
    except IOError:
        return None
    try:
        return response.read()
    except IOError:
        return None
    finally:
        # Always release the connection, even when read() fails.
        response.close()


def getPicsByKeyword(keyword, yourPath, page_num=1, max_count=100):
    """Download Flickr search results for *keyword* into *yourPath*.

    Images are saved as ``<COUNT>.jpg`` using the module-level ``COUNT``
    counter, which is incremented after each successful download. Progress
    and failures are written to ``log.txt`` in the current directory.

    keyword   -- unicode search term (it is UTF-8 encoded before quoting).
    yourPath  -- existing directory that receives the image files.
    page_num  -- first result page to request.
    max_count -- stop once COUNT exceeds this value (default 100, the limit
                 that used to be hard-coded).

    Returns None. Network errors are skipped/logged, never raised.
    """
    global COUNT
    sBaseUrl = r'http://www.flickr.com'
    sEndUrl = r'/sizes/z/in/photostream/'
    quote_keyword = urllib.quote_plus(keyword.encode('utf8'))
    # Compile both patterns once instead of on every loop iteration.
    re_url = re.compile(r'(/photos/[A-Za-z]*/[0-9]{10})')
    re_image = re.compile(
        r'(http://farm[0-9].staticflickr.com/[0-9]*/\w*.jpg[\?]*[\w=]*)')

    # The "#page=N" part of the search URL is a fragment and is not sent to
    # the server, so successive pages may return identical HTML. Tracking the
    # photo paths already handled lets us stop instead of looping forever.
    seen = set()
    log = None
    try:
        while COUNT <= max_count:
            # Status message ("start grabbing pictures"); the literal was
            # mojibake (GBK bytes shown as Latin text) and has been restored.
            print('开始抓图')
            main_url = "http://www.flickr.com/search/?q=%s&m=text#page=%d" % (
                quote_keyword, page_num)
            content = _read_url(main_url)
            if content is None:
                break
            if log is None:
                # Opened once per call so later pages do not truncate the
                # entries written for earlier pages.
                log = codecs.open('log.txt', 'w', 'utf-8')

            fresh = set(re_url.findall(content)) - seen
            if not fresh:
                # Nothing new on this page: no more results to fetch.
                break
            seen |= fresh

            for aurl in fresh:
                pic_url = '%s%s%s' % (sBaseUrl, aurl, sEndUrl)
                print('-- %s' % pic_url)
                pic_content = _read_url(pic_url)
                if pic_content is None:
                    continue

                try:
                    # The last match on the sizes page is the one to download.
                    realurl = re_image.findall(pic_content)[-1]
                except IndexError as data:
                    log.write(str(data) + '\n')
                    log.flush()
                    continue

                target = os.path.join(yourPath, '%d.jpg' % COUNT)
                try:
                    urllib.URLopener().retrieve(realurl, target)
                except IOError:
                    # COUNT is left untouched: the slot was not filled, and
                    # decrementing it would overwrite the previous image.
                    log.write('get pic from the url:%s ##failure\n' % realurl)
                    log.flush()
                    continue

                log.write('get pic from the url:%s ##success\n' % realurl)
                log.flush()
                COUNT += 1
                if COUNT > max_count:
                    break

            page_num += 1
    finally:
        if log is not None:
            log.close()
# Driver: imglist.txt holds one entry per line. Each entry is used as the
# output folder; the part after the last path separator is the search keyword.
with open("imglist.txt", "r") as keyword_file:
    for line in keyword_file:
        # strip() also drops a trailing "\r" from files with CRLF endings.
        fold_name = line.decode("utf-8").strip()
        if not fold_name:
            # Blank lines would otherwise reach os.makedirs('') and crash.
            continue
        if not os.path.exists(fold_name):
            os.makedirs(fold_name)
        # Accept both "\" and "/" as the separator before the keyword.
        last_sep = max(fold_name.rfind('\\'), fold_name.rfind('/'))
        sencery_name = fold_name[last_sep + 1:]
        # Restart file numbering at 1 for every keyword/folder.
        COUNT = 1
        getPicsByKeyword(sencery_name, fold_name)
# The former f.close()/cursor.close()/conn.close() calls referenced names that
# are never defined in this script (NameError); the "with" block above closes
# the only file handle that is actually opened.
- from the5fire.com
----EOF-----
微信公众号:Python程序员杂谈
微信公众号:Python程序员杂谈