{"id":45,"date":"2016-08-04T12:50:33","date_gmt":"2016-08-04T04:50:33","guid":{"rendered":"http:\/\/ayonel.me\/?p=45"},"modified":"2016-08-25T01:47:41","modified_gmt":"2016-08-24T17:47:41","slug":"python_spider","status":"publish","type":"post","link":"https:\/\/ayonel.malash.net\/index.php\/2016\/08\/04\/python_spider\/","title":{"rendered":"python\u5c0f\u722c\u866b&#8212;\u722c\u53d6\u7f51\u540d\u53ca\u5934\u50cf"},"content":{"rendered":"<p>\u4e00\u6b3e\u4ea7\u54c1\u53d1\u5e03\uff0c\u5176\u521d\u59cb\u7528\u6237\u91cf\u5f88\u5c0f\uff0c\u8fd9\u65f6\u5c31\u9700\u8981\u4f2a\u9020\u4e00\u6279\u7528\u6237\u6765\u589e\u52a0\u5e73\u53f0\u7684\u7528\u6237\u91cf\uff0c\u642d\u8d77\u521d\u6b65\u7684\u7528\u6237\u751f\u6001\u7cfb\u7edf\u3002\u90a3\u4e48\u4f2a\u9020\u7528\u6237\u65f6\uff0c\u6211\u4eec\u5fc5\u7136\u9700\u8981\u7f51\u540d\u4ee5\u53ca\u5934\u50cf\uff08\u4e5f\u6709\u53ef\u80fd\u9700\u8981\u7b7e\u540d\uff09\u7b49\u7b49\u4fe1\u606f\u3002\u6240\u4ee5\u524d\u4e9b\u65e5\u5b50\u5199\u4e86\u4e2a\u5c0f\u722c\u866b\u6765\u722c\u53d6\u4e00\u6279\uff0830000+\uff09\u7f51\u540d\u53ca\u5934\u50cf\u3002<\/p>\n<p>\u5bf9\u4e8e\u5934\u50cf\u7684\u722c\u53d6\uff0c\u6211\u9009\u62e9\u7684\u7ad9\u70b9\u662f\u201c\u6211\u8981\u4e2a\u6027\u7f51\u201d\uff08http:\/\/www.woyaogexing.com\uff09\uff0c\u5b83\u7684\u56fe\u7247\u90fd\u6ca1\u6709\u505a\u9632\u76d7\u94fe\uff0c\u56e0\u6b64\u6211\u53ea\u9700\u8981\u722c\u5b83\u56fe\u7247\u7684url\u5c31\u884c\u3002\u5bf9\u4e8e\u6635\u79f0\uff0c\u6211\u9009\u62e9\u7684\u7f51\u7ad9\u662f\u201cQQ\u7f51\u540d\u201d\uff08http:\/\/www.oicq88.com\uff09\u3002\u9009\u62e9\u597d\u722c\u53d6\u76ee\u6807\uff0c\u5c31\u9700\u8981\u54b1\u7684\u722c\u866b\u767b\u573a\u4e86\u3002\u4ee5\u524d\u722c\u866b\u7ecf\u5e38\u7528scrapy\uff0c\u4f46\u5bf9\u4e8e\u6211\u8fd9\u6b21\u7684\u91cf\u663e\u7136\u662f\u6740\u9e21\u7528\u725b\u5200\u3002\u8fd9\u4e2a\u722c\u866b\u662f\u5229\u7528python\u81ea\u5e26\u7684urllib2\uff0c\u89e3\u6790\u7f51\u9875\u7528\u7684\u662fBeautifulSoup(\u
53ef\u7528pip\u5b89\u88c5)\uff0c\u4e0expath\u539f\u7406\u7c7b\u4f3c\u3002<\/p>\n<p>\u4e0b\u9762\u662f\u5934\u50cf\u722c\u866b\u6e90\u7801\uff0c\u4ee3\u7801\u5f88\u7b80\u5355\uff1a<\/p>\n<div class=\"codecolorer-container python railscasts\" style=\"overflow:auto;white-space:nowrap;width:100%;height:100%;\"><div class=\"python codecolorer\"><span class=\"co1\"># coding:utf-8<\/span><br \/>\n<span class=\"kw1\">import<\/span> <span class=\"kw3\">time<\/span><br \/>\n<span class=\"kw1\">import<\/span> <span class=\"kw3\">urllib2<\/span><br \/>\n<span class=\"kw1\">import<\/span> <span class=\"kw3\">random<\/span><br \/>\n<span class=\"kw1\">from<\/span> bs4 <span class=\"kw1\">import<\/span> BeautifulSoup<br \/>\n<br \/>\n<span class=\"kw1\">def<\/span> <span class=\"kw2\">filter<\/span><span class=\"br0\">&#40;<\/span>tag<span class=\"br0\">&#41;<\/span>:<br \/>\n&nbsp; &nbsp; <span class=\"kw1\">if<\/span> <span class=\"kw2\">cmp<\/span><span class=\"br0\">&#40;<\/span>tag.<span class=\"me1\">name<\/span><span class=\"sy0\">,<\/span> <span class=\"st0\">'img'<\/span><span class=\"br0\">&#41;<\/span> <span class=\"sy0\">==<\/span> <span class=\"nu0\">0<\/span>:<br \/>\n&nbsp; &nbsp; &nbsp; &nbsp; <span class=\"kw1\">if<\/span> tag.<span class=\"me1\">has_attr<\/span><span class=\"br0\">&#40;<\/span><span class=\"st0\">'class'<\/span><span class=\"br0\">&#41;<\/span>:<br \/>\n&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <span class=\"kw1\">if<\/span> <span class=\"kw2\">cmp<\/span><span class=\"br0\">&#40;<\/span>tag<span class=\"br0\">&#91;<\/span><span class=\"st0\">'class'<\/span><span class=\"br0\">&#93;<\/span><span class=\"br0\">&#91;<\/span><span class=\"nu0\">0<\/span><span class=\"br0\">&#93;<\/span><span class=\"sy0\">,<\/span> <span class=\"st0\">'lazy'<\/span><span class=\"br0\">&#41;<\/span> <span class=\"sy0\">==<\/span> <span class=\"nu0\">0<\/span>:<br \/>\n&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <span class=\"kw1\">return<\/span> <span 
class=\"kw2\">True<\/span><br \/>\n<br \/>\noutfile <span class=\"sy0\">=<\/span> <span class=\"kw2\">open<\/span><span class=\"br0\">&#40;<\/span><span class=\"st0\">&quot;.\/20160717\/avatar.txt&quot;<\/span><span class=\"sy0\">,<\/span> <span class=\"st0\">&quot;a&quot;<\/span><span class=\"br0\">&#41;<\/span><br \/>\n<br \/>\n<span class=\"kw1\">for<\/span> i <span class=\"kw1\">in<\/span> <span class=\"kw2\">range<\/span><span class=\"br0\">&#40;<\/span><span class=\"nu0\">135<\/span><span class=\"sy0\">,<\/span> <span class=\"nu0\">1500<\/span><span class=\"br0\">&#41;<\/span>:<br \/>\n&nbsp; &nbsp; <span class=\"kw1\">print<\/span> i<br \/>\n&nbsp; &nbsp; url <span class=\"sy0\">=<\/span> <span class=\"st0\">'http:\/\/www.woyaogexing.com\/touxiang\/index_'<\/span>+<span class=\"kw2\">str<\/span><span class=\"br0\">&#40;<\/span>i<span class=\"br0\">&#41;<\/span>+<span class=\"st0\">'.html'<\/span><br \/>\n&nbsp; &nbsp; response <span class=\"sy0\">=<\/span> <span class=\"kw3\">urllib2<\/span>.<span class=\"me1\">urlopen<\/span><span class=\"br0\">&#40;<\/span>url<span class=\"br0\">&#41;<\/span><br \/>\n&nbsp; &nbsp; data <span class=\"sy0\">=<\/span> response.<span class=\"me1\">read<\/span><span class=\"br0\">&#40;<\/span><span class=\"br0\">&#41;<\/span><br \/>\n&nbsp; &nbsp; soup <span class=\"sy0\">=<\/span> BeautifulSoup<span class=\"br0\">&#40;<\/span>data<span class=\"sy0\">,<\/span> <span class=\"st0\">&quot;lxml&quot;<\/span><span class=\"br0\">&#41;<\/span><br \/>\n&nbsp; &nbsp; imgs <span class=\"sy0\">=<\/span> soup.<span class=\"me1\">find_all<\/span><span class=\"br0\">&#40;<\/span><span class=\"kw2\">filter<\/span><span class=\"br0\">&#41;<\/span><br \/>\n<br \/>\n&nbsp; &nbsp; <span class=\"kw1\">for<\/span> img <span class=\"kw1\">in<\/span> imgs:<br \/>\n&nbsp; &nbsp; &nbsp; &nbsp; outfile.<span class=\"me1\">write<\/span><span class=\"br0\">&#40;<\/span>img<span class=\"br0\">&#91;<\/span><span class=\"st0\">'src'<\/span><span 
class=\"br0\">&#93;<\/span> + <span class=\"st0\">','<\/span> + <span class=\"kw2\">str<\/span><span class=\"br0\">&#40;<\/span><span class=\"kw3\">random<\/span>.<span class=\"me1\">randint<\/span><span class=\"br0\">&#40;<\/span><span class=\"nu0\">0<\/span><span class=\"sy0\">,<\/span> <span class=\"nu0\">1<\/span><span class=\"br0\">&#41;<\/span><span class=\"br0\">&#41;<\/span>+ <span class=\"st0\">'<span class=\"es0\">\\n<\/span>'<\/span><span class=\"br0\">&#41;<\/span><br \/>\n&nbsp; &nbsp; <span class=\"kw3\">time<\/span>.<span class=\"me1\">sleep<\/span><span class=\"br0\">&#40;<\/span><span class=\"nu0\">0.5<\/span><span class=\"br0\">&#41;<\/span><\/div><\/div>\n<p>#\u6700\u540e\u8f93\u51fa\u7684\u65f6\u5019\u5728\u6bcf\u4e2a\u5934\u50cfurl\u540e\u9762\u63a5\u4e86\u4e2a\u968f\u673a\u76840\u6216\u80051\uff0c\u8fd9\u662f\u7528\u968f\u673a\u6570\u6765\u6807\u5fd7\u8fd9\u4e2a\u4eba\u662f\u7537\u751f\u8fd8\u662f\u5973\u751f\u3002<\/p>\n<p>\u4e0b\u9762\u662f\u722c\u53d6\u6635\u79f0\u7684\u722c\u866b\uff1a<\/p>\n<div class=\"codecolorer-container python railscasts\" style=\"overflow:auto;white-space:nowrap;width:100%;height:100%;\"><div class=\"python codecolorer\"><span class=\"co1\"># coding:utf-8<\/span><br \/>\n<span class=\"kw1\">import<\/span> <span class=\"kw3\">sys<\/span><br \/>\n<span class=\"kw2\">reload<\/span><span class=\"br0\">&#40;<\/span><span class=\"kw3\">sys<\/span><span class=\"br0\">&#41;<\/span><br \/>\n<span class=\"kw3\">sys<\/span>.<span class=\"me1\">setdefaultencoding<\/span><span class=\"br0\">&#40;<\/span><span class=\"st0\">&quot;utf-8&quot;<\/span><span class=\"br0\">&#41;<\/span><br \/>\n<br \/>\n<span class=\"kw1\">import<\/span> <span class=\"kw3\">urllib2<\/span><br \/>\n<span class=\"kw1\">from<\/span> bs4 <span class=\"kw1\">import<\/span> BeautifulSoup<br \/>\n<br \/>\n<span class=\"kw1\">def<\/span> <span class=\"kw2\">filter<\/span><span class=\"br0\">&#40;<\/span>tag<span class=\"br0\">&#41;<\/span>:<span 
class=\"co1\">#\u89e3\u6790\u5305\u542b\u7f51\u540d\u7684\u6807\u7b7e<\/span><br \/>\n&nbsp; &nbsp; <span class=\"kw1\">if<\/span> <span class=\"kw2\">cmp<\/span><span class=\"br0\">&#40;<\/span>tag.<span class=\"me1\">name<\/span><span class=\"sy0\">,<\/span> <span class=\"st0\">&quot;ul&quot;<\/span><span class=\"br0\">&#41;<\/span> <span class=\"sy0\">==<\/span> <span class=\"nu0\">0<\/span>:<br \/>\n&nbsp; &nbsp; &nbsp; &nbsp; <span class=\"kw1\">if<\/span> tag.<span class=\"me1\">has_attr<\/span><span class=\"br0\">&#40;<\/span><span class=\"st0\">&quot;class&quot;<\/span><span class=\"br0\">&#41;<\/span>:<br \/>\n&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <span class=\"kw1\">if<\/span> <span class=\"kw2\">cmp<\/span><span class=\"br0\">&#40;<\/span>tag<span class=\"br0\">&#91;<\/span><span class=\"st0\">'class'<\/span><span class=\"br0\">&#93;<\/span><span class=\"br0\">&#91;<\/span><span class=\"nu0\">0<\/span><span class=\"br0\">&#93;<\/span><span class=\"sy0\">,<\/span> <span class=\"st0\">'list'<\/span><span class=\"br0\">&#41;<\/span> <span class=\"sy0\">==<\/span> <span class=\"nu0\">0<\/span>:<br \/>\n&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <span class=\"kw1\">return<\/span> <span class=\"kw2\">True<\/span><br \/>\n<br \/>\noutfile <span class=\"sy0\">=<\/span> <span class=\"kw2\">open<\/span><span class=\"br0\">&#40;<\/span><span class=\"st0\">&quot;.\/name.txt&quot;<\/span><span class=\"sy0\">,<\/span> <span class=\"st0\">&quot;a&quot;<\/span><span class=\"br0\">&#41;<\/span><span class=\"co1\">#\u8f93\u51fa\u6587\u4ef6<\/span><br \/>\n<br \/>\n<span class=\"kw1\">for<\/span> i <span class=\"kw1\">in<\/span> <span class=\"kw2\">range<\/span><span class=\"br0\">&#40;<\/span><span class=\"nu0\">55<\/span><span class=\"sy0\">,<\/span> <span class=\"nu0\">145<\/span><span class=\"br0\">&#41;<\/span>:<br \/>\n&nbsp; &nbsp; <span class=\"kw1\">print<\/span> i<br \/>\n&nbsp; &nbsp; url <span class=\"sy0\">=<\/span> <span 
class=\"st0\">'http:\/\/www.oicq88.com\/nvsheng\/'<\/span>+<span class=\"kw2\">str<\/span><span class=\"br0\">&#40;<\/span>i<span class=\"br0\">&#41;<\/span>+<span class=\"st0\">'.htm'<\/span><br \/>\n&nbsp; &nbsp; response <span class=\"sy0\">=<\/span> <span class=\"kw3\">urllib2<\/span>.<span class=\"me1\">urlopen<\/span><span class=\"br0\">&#40;<\/span>url<span class=\"br0\">&#41;<\/span><span class=\"co1\">#\/nvsheng\/\u53ef\u4ee5\u66ff\u6362\u4e3a\u5176\u4ed6\u7684<\/span><br \/>\n&nbsp; &nbsp; data <span class=\"sy0\">=<\/span> response.<span class=\"me1\">read<\/span><span class=\"br0\">&#40;<\/span><span class=\"br0\">&#41;<\/span><br \/>\n&nbsp; &nbsp; soup <span class=\"sy0\">=<\/span> BeautifulSoup<span class=\"br0\">&#40;<\/span>data<span class=\"sy0\">,<\/span> <span class=\"st0\">&quot;lxml&quot;<\/span><span class=\"br0\">&#41;<\/span><br \/>\n&nbsp; &nbsp; ul <span class=\"sy0\">=<\/span> soup.<span class=\"me1\">find_all<\/span><span class=\"br0\">&#40;<\/span><span class=\"kw2\">filter<\/span><span class=\"br0\">&#41;<\/span><br \/>\n<br \/>\n&nbsp; &nbsp; ulsoup <span class=\"sy0\">=<\/span> BeautifulSoup<span class=\"br0\">&#40;<\/span><span class=\"kw2\">str<\/span><span class=\"br0\">&#40;<\/span>ul<span class=\"br0\">&#91;<\/span><span class=\"nu0\">0<\/span><span class=\"br0\">&#93;<\/span><span class=\"br0\">&#41;<\/span><span class=\"sy0\">,<\/span> <span class=\"st0\">&quot;lxml&quot;<\/span><span class=\"br0\">&#41;<\/span><br \/>\n&nbsp; &nbsp; lis <span class=\"sy0\">=<\/span> ulsoup.<span class=\"me1\">find_all<\/span><span class=\"br0\">&#40;<\/span><span class=\"st0\">&quot;li&quot;<\/span><span class=\"br0\">&#41;<\/span><br \/>\n<br \/>\n<br \/>\n&nbsp; &nbsp; <span class=\"kw1\">for<\/span> li <span class=\"kw1\">in<\/span> lis:<br \/>\n&nbsp; &nbsp; &nbsp; &nbsp; outfile.<span class=\"me1\">write<\/span><span class=\"br0\">&#40;<\/span><span class=\"kw2\">str<\/span><span class=\"br0\">&#40;<\/span>li.<span 
class=\"me1\">p<\/span>.<span class=\"me1\">text<\/span><span class=\"br0\">&#41;<\/span>+<span class=\"st0\">'<span class=\"es0\">\\n<\/span>'<\/span><span class=\"br0\">&#41;<\/span><\/div><\/div>\n<p>\u53ef\u4ee5\u76f4\u63a5\u590d\u5236\u6e90\u7801\u8fdb\u884c\u722c\u53d6\uff0c\u524d\u63d0\u662f\u201c\u6211\u8981\u4e2a\u6027\u7f51\u201d\u4ee5\u53ca\u201cQQ\u7f51\u540d\u201d\u7684\u524d\u7aef\u9875\u9762\u6ca1\u53d8\uff0c\u4e3a\u4e86\u9632\u6b62\u8fd9\u79cd\u60c5\u51b5\u53d1\u751f\uff0c\u6587\u672b\u6211\u4f1a\u9644\u4e0a\u722c\u53d6\u7684\u6e90\u6587\u4ef6\u94fe\u63a5\uff0c\u5927\u5bb6\u53ef\u4ee5\u81ea\u884c\u4e0b\u8f7d\u3002<\/p>\n<p>\u6587\u4ef6\u4e0b\u8f7d\u94fe\u63a5\uff1a<a href=\"https:\/\/ayonel.malash.net\/files\/name_avatar_over.zip\">https:\/\/ayonel.malash.net\/files\/name_avatar_over.zip<\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u4e00\u6b3e\u4ea7\u54c1\u53d1\u5e03\uff0c\u5176\u521d\u59cb\u7528\u6237\u91cf\u5f88\u5c0f\uff0c\u8fd9\u65f6\u5c31\u9700\u8981\u4f2a\u9020\u4e00\u6279\u7528\u6237\u6765\u589e\u52a0\u5e73\u53f0\u7684\u7528\u6237\u91cf\uff0c\u642d\u8d77\u521d\u6b65\u7684\u7528\u6237\u751f\u6001\u7cfb\u7edf\u3002\u90a3\u4e48\u4f2a\u9020\u7528\u6237 
[&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[5],"tags":[],"class_list":["post-45","post","type-post","status-publish","format-standard","hentry","category-python"],"_links":{"self":[{"href":"https:\/\/ayonel.malash.net\/index.php\/wp-json\/wp\/v2\/posts\/45","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/ayonel.malash.net\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/ayonel.malash.net\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/ayonel.malash.net\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/ayonel.malash.net\/index.php\/wp-json\/wp\/v2\/comments?post=45"}],"version-history":[{"count":12,"href":"https:\/\/ayonel.malash.net\/index.php\/wp-json\/wp\/v2\/posts\/45\/revisions"}],"predecessor-version":[{"id":72,"href":"https:\/\/ayonel.malash.net\/index.php\/wp-json\/wp\/v2\/posts\/45\/revisions\/72"}],"wp:attachment":[{"href":"https:\/\/ayonel.malash.net\/index.php\/wp-json\/wp\/v2\/media?parent=45"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/ayonel.malash.net\/index.php\/wp-json\/wp\/v2\/categories?post=45"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/ayonel.malash.net\/index.php\/wp-json\/wp\/v2\/tags?post=45"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}