wget.py webをまとめて取得　などなど

2010年10月29日 / PC / Python / boxheadroom

Linuxのwgetコマンドっぽいものを、Pythonで書いてみました。
svnからソースをまとめて取得したいのに、svnがインストールされていないときなどに使います。
（普段のWebの保存にはfirefoxのscrapbook拡張を使ってます）

Vista+Python2.5, 2.6で動作確認

久しぶりに使おうと思ったら、見つけるのに時間がかかったのでblogにもメモ。

wget.pyとして保存

#Reuse as free as public domain.
import urllib2
import xml.etree.ElementTree as etree
from BeautifulSoup import BeautifulSoup
import os
import re

def wget(url,maxdepth=0,depth=0):
    """Fetch *url*, save it below the current directory and crawl its links.

    The local location is computed by ``mkdir(url)``.  A URL whose last
    path segment is non-empty is saved as a single file.  A URL ending in
    "/" is saved as ``index.html``; its relative links are then visited:
    links that end in "/" and stay below *url* are crawled recursively,
    every other link is handed to ``dl()``.

    url      -- address to fetch.
    maxdepth -- deepest recursion level whose links are still followed.
                A page at ``depth == maxdepth`` is saved but not crawled;
                nothing is fetched once ``depth > maxdepth``.
    depth    -- current recursion level (0 for the initial call).

    Errors raised while opening or reading *url* are not caught here.
    """
    print(url)
    if depth > maxdepth:
        return

    # Only locals are used: no module-level state is written by the crawl.
    response = urllib2.build_opener().open(url)
    try:
        html = response.read()
    finally:
        response.close()

    p, b = mkdir(url)
    print("%s @ %s" % (p, b))

    # mkdir() already returns the file name in quoted form, so it is used
    # as-is; quoting it again would turn "%" into "%25" and make the name
    # differ from the one dl() uses for the same URL.
    out = open("%s/%s" % (p, b or "index.html"), "wb")
    try:
        out.write(html)
    finally:
        out.close()

    if b:
        # Plain file: nothing to crawl.
        return
    if depth >= maxdepth:
        return

    for anchor in BeautifulSoup(html).findAll("a"):
        href = anchor.get("href")
        # Skip anchors without a target and absolute (off-tree) links.
        if not href or re.match("https?://", href):
            continue

        target = urllib2.urlparse.urljoin(url, href)
        if target == url:
            # Self-reference (e.g. "./"): this page is already saved.
            continue
        if target.endswith("/") and target.startswith(url):
            # Pass maxdepth through and advance depth so the limit holds.
            wget(target, maxdepth, depth + 1)
        else:
            dl(target)
    
def dl(url):
    """Download *url* into the local mirror tree unless it already exists.

    ``mkdir(url)`` creates the local directories and returns the target
    directory together with the file name.  Nothing is downloaded when the
    URL has no file name (it ends in "/") or when the target file is
    already present on disk.

    Errors raised while opening or reading *url* are not caught here.
    """
    p, b = mkdir(url)
    if not b:
        return
    fn = "%s/%s" % (p, b)
    if os.path.exists(fn):
        return

    # try/finally instead of "with": the script targets Python 2.5 as well.
    response = urllib2.build_opener().open(url)
    try:
        data = response.read()
    finally:
        response.close()

    fp = open(fn, "wb")
    try:
        fp.write(data)
    finally:
        fp.close()

def quote(txt):
    """Return *txt* (one URL path segment) as a name safe to use on disk.

    A leading "?" is dropped, a leading "." is prefixed with "_", and the
    result is percent-encoded with no extra safe characters, so "/", "\\",
    ":", "*", "?", '"', "<", ">" and "|" all come out as %XX escapes.
    """
    # Import the documented function directly rather than through urllib2,
    # which only happens to re-export it; fall back to the Python 3 location.
    try:
        from urllib import quote as _url_quote
    except ImportError:
        from urllib.parse import quote as _url_quote

    if txt.startswith("?"):
        txt = txt[1:]
    if txt.startswith("."):
        txt = "_" + txt
    # With safe="" only letters, digits and a few punctuation marks such as
    # "_", "." and "-" are left unescaped, so no further replacement of
    # filesystem-reserved characters is needed afterwards.
    return _url_quote(txt, safe="")
def mkdir(fullurl):
    """Create the local directory chain for *fullurl* and return its location.

    A leading "http://" or "https://" is removed (case-insensitively) and
    the rest is split on "/".  Every non-empty segment except the last is
    passed through ``quote()`` and created, one level at a time, below the
    current directory.

    Returns a tuple ``(directory, filename)``: *directory* is the path that
    was created (starting with "."), *filename* is the quoted last segment,
    which is the empty string when *fullurl* ends in "/".
    """
    dl_dir = fullurl
    lowered = dl_dir.lower()
    # Drop the scheme so the host name becomes the top-level directory.
    for scheme in ("http://", "https://"):
        if lowered.startswith(scheme):
            dl_dir = dl_dir[len(scheme):]
            break

    dlist = dl_dir.split("/")
    dd = "."
    for j in dlist[:-1]:
        if not j:
            # Empty segment (e.g. from "//"): nothing to create.
            continue
        dd = "%s/%s" % (dd, quote(j))
        if not os.path.exists(dd):
            os.mkdir(dd)

    print(dd)
    return dd, quote(dlist[-1])
            
            
if __name__=="__main__":
    # List of URLs to fetch.
    # Example: pulling an svn repository over plain HTTP, e.g.
    #   "http://pyglet.googlecode.com/svn/trunk/"   (pyglet)
    urls=[
        # Wanted files such as away3dlight.lights.Light.as from this tree.
        "http://away3d.googlecode.com/svn/trunk/fp10/Away3DLite/",
    ]

    for u in urls:
        wget(u,maxdepth=2)

コメントを残す / コメントをキャンセル