Update of /cvsroot/audacity/audacity-src/scripts/mw2html_audacity
In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv24392
Modified Files:
mw2html.py sidebar.html wiki2htm.bat wiki2htm.sh
Log Message:
Fixing the dump script. Added an error counter. Replaced the deprecated sha module
with hashlib. Fixed sidebar.html links (they have to be relative to the online wiki
and not the local dump). Fixed the batch file so it doesn't produce an "access
denied" error when the destination folder is not empty.
Index: wiki2htm.sh
===================================================================
RCS file: /cvsroot/audacity/audacity-src/scripts/mw2html_audacity/wiki2htm.sh,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- wiki2htm.sh 26 May 2009 20:08:52 -0000 1.1
+++ wiki2htm.sh 29 May 2009 04:04:20 -0000 1.2
@@ -6,5 +6,5 @@
mkdir -p "${tmpdir_}"
python mw2html.py "${srcuri}" "${tmpdir_}" -s
-mv "${tmpdir_}/audacityteam.org/manual" "${helpdir}"
+mv "${tmpdir_}/audacityteam.org" "${helpdir}"
\rm -r "${tmpdir_}"
Index: wiki2htm.bat
===================================================================
RCS file: /cvsroot/audacity/audacity-src/scripts/mw2html_audacity/wiki2htm.bat,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- wiki2htm.bat 26 May 2009 20:08:57 -0000 1.2
+++ wiki2htm.bat 29 May 2009 04:04:20 -0000 1.3
@@ -1,4 +1,6 @@
python mw2html.py http://audacityteam.org/manual ..\..\help\temp -s
-move ..\..\help\temp\audacityteam.org\manual ..\..\help\manual
-rmdir ..\..\help\temp
+rmdir /S /Q ..\..\help\manual
+mkdir ..\..\help\manual
+move ..\..\help\temp\audacityteam.org ..\..\help\manual
+rmdir /S /Q ..\..\help\temp
Index: sidebar.html
===================================================================
RCS file: /cvsroot/audacity/audacity-src/scripts/mw2html_audacity/sidebar.html,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- sidebar.html 24 May 2009 21:35:34 -0000 1.1
+++ sidebar.html 29 May 2009 04:04:20 -0000 1.2
@@ -1,18 +1,18 @@
<div class="portlet" id="p-logo">
- <a style="background-image:
url(images/new_audacitytransmediawikimanual.png);" href="index.html"
title="Main Page"></a>
+ <a style="background-image:
url(http://audacityteam.org/images/NEW_AudacityTransMediaWikiMANUAL.png);"
href="http://audacityteam.org/manual" title="Main Page"></a>
</div>
<script type="text/javascript"> if (window.isMSIE55) fixalpha(); </script>
<div class="portlet" id="p-tb"> </div>
<ul>
- <li><a href="index.html#Tutorials">Tutorials</a></li>
- <li><a href="index.html#Using_Audacity">Using Audacity</li>
- <li><a href="index.html#Reference">Reference</a></li>
+ <li><a href="/manual/index.php?title=Main_Page#Tutorials">Tutorials</a></li>
+ <li><a href="/manual/index.php?title=Main_Page#Using_Audacity">Using
Audacity</li>
+ <li><a href="/manual/index.php?title=Main_Page#Reference">Reference</a></li>
<ul>
- <li><a href="menu_reference.html">Menu Bar</a></li>
- <li><a href="index.html#Toolbars">Toolbars</a></li>
- <li><a href="index.html#Project_Window">Project Window</li>
- <li><a href="preferences.html">Preferences</a></li>
+ <li><a href="/manual/index.php?title=Menu_Reference">Menu Bar</a></li>
+ <li><a
href="/manual/index.php?title=Main_Page#Toolbars:_buttons_and_controls_for_playback.2Crecording_and_editing">Toolbars</a></li>
+ <li><a href="/manual/index.php?title=Main_Page#Project_Window:">Project
Window</li>
+ <li><a href="/manual/index.php?title=Preferences">Preferences</a></li>
</ul>
- <li><a href="faq.html">FAQ</a></li>
- <li><a href="glossary.html">Glossary</a></li>
+ <li><a href="/manual/index.php?title=FAQ">FAQ</a></li>
+ <li><a href="/manual/index.php?title=Glossary">Glossary</a></li>
</ul>
Index: mw2html.py
===================================================================
RCS file: /cvsroot/audacity/audacity-src/scripts/mw2html_audacity/mw2html.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- mw2html.py 24 May 2009 21:35:34 -0000 1.1
+++ mw2html.py 29 May 2009 04:04:20 -0000 1.2
@@ -28,7 +28,7 @@
import urlparse
import os, os.path
import errno
-import sha
+import hashlib
import httplib
#import pdb
from time import strftime
@@ -49,10 +49,12 @@
MADE_BY_COMMENT = '<!-- Content generated by Mediawiki and mw2html -->'
INDEX_HTML = 'index.html'
url_filename_cache = {}
+redir_cache = {}
wrote_file_set = set()
sidebar_content = ''
footer_text = ''
counter = 0
+errors = 0
conn = None
domain = ''
@@ -63,7 +65,7 @@
Instances contain all options passed at the command line.
"""
def __init__(self, rooturl, outdir,
- flatten=True, lower=True, index=None, clean=True,
+ flatten=True, index=None, clean=True,
sidebar=None, hack_skin=True,
made_by=True, overwrite=False, footer=None,
skin=MONOBOOK_SKIN, move_href=True,
@@ -72,7 +74,6 @@
self.rooturl = rooturl
self.outdir = os.path.abspath(outdir)
self.flatten = flatten
- self.lower = lower
self.index = index
self.clean = clean
self.sidebar = sidebar
@@ -154,7 +155,7 @@
if config.made_by:
doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=')
- SIDEBAR_ID = 'SIDEBAR' + sha.new(str(random.random())).hexdigest()
+ SIDEBAR_ID = 'SIDEBAR' + hashlib.md5(str(random.random())).hexdigest()
# Remove sidebar HTML
doc = re.sub(
@@ -443,7 +444,7 @@
if os.path.splitext(tail)[1] == '.png':
tail = os.path.splitext(tail)[0]
if set(tail) <= set('0123456789abcdef') and len(tail) == 32:
- ans = 'math_' + sha.new(tail).hexdigest()[:4] + '.png'
+ ans = 'math_' + hashlib.md5(tail).hexdigest()[:4] + '.png'
return os.path.join(par, ans)
def flatten_filename(url, config, filename):
@@ -471,44 +472,73 @@
def url_open(url):
# download a file and retrieve its content and mimetype
- global conn, domain, counter
+ global conn, domain, counter, redir_cache, errors
- redirect = 'foo'
+ l_redir = []
+ redirect = url
while redirect != '':
- pos = url.find(domain)
- if pos == -1:
- print 'ERROR: url', url, 'is in a different domain.'
- return ('', '')
+ l_redir += [url]
- rel_url = url[pos+len(domain):]
+ rel_url = url
+ pos = url.find(domain)
+ if pos != -1:
+ rel_url = url[pos+len(domain):]
attempts = 0
- while attempts < 2:
+ #number of attempts
+ total_attempts = 3
+ recovered = False
+ success = False
+
+ while not success and attempts < total_attempts:
+ #increment httplib requests counter
counter += 1
- conn.request("GET", rel_url)
try:
+ conn.request("GET", rel_url)
r = conn.getresponse()
print 'Status',r.status,r.reason,'accessing',rel_url
if r.status == 404:
+ errors += 1
return ('','')
- attempts += 2
+ if r.status == 500:
+ print "eventually this error might be recovered. let's try again."
+ print 'reconnecting...'
+ conn = httplib.HTTPConnection(domain)
+ attempts += 1
+ continue
+ if attempts != 0:
+ recovered = True
+ success = True
+
except httplib.HTTPException, e:
print 'ERROR',e.__class__.__name__,'while retrieving', url
conn.close
- if e.__class__.__name__ in ['BadStatusLine',
'ImproperConnectionState', 'NotConnected', 'IncompleteRead']:
+ if e.__class__.__name__ in ['BadStatusLine',
'ImproperConnectionState', 'NotConnected', 'IncompleteRead',
'ResponseNotReady']:
print "eventually this error might be recovered. let's try again."
print 'reconnecting...'
conn = httplib.HTTPConnection(domain)
attempts += 1
else:
+ sys.exit()
+ errors += 1
return ('','')
- if attempts == 3:
+ if recovered:
print "error recovered"
-
+
+ if not success:
+ errors += 1
+ return ('', '')
+
redirect = r.getheader('Location', '').split(';')[0]
- url = redirect
- doc = r.read()
+
+ if redirect != "":
+ url = redirect
+ else:
+ doc = r.read()
+
+ for item in l_redir:
+ redir_cache[normalize_url(item)] = normalize_url(url)
mimetype = r.getheader('Content-Type', '').split(';')[0].lower()
@@ -520,19 +550,22 @@
Transforms web url into local url and caches it.
Downloads the file to disk and works with it there instead of download the
same file two times (Performance Improvement).
"""
+ if get_domain(url) != domain:
+ url = normalize_url(urlparse.urljoin(config.rooturl, url))
+
url = split_section(url)[0]
- if url in url_filename_cache:
- return url_filename_cache[url]
+ nurl = normalize_url(url)
- part = normalize_url(url)
+ if nurl in url_filename_cache:
+ return url_filename_cache[nurl]
#ParseResult(scheme='http', netloc='www.cwi.nl:80',
path='/%7Eguido/Python.html', params='', query='', fragment='')
L = list(urlparse.urlparse(url))
L[2] = L[2].strip('/')
-
- if not '.' in L[2]:
+ lpath = L[2].split('/')
+ if not '.' in lpath[-1]:
# url ends with a directory name. Store it under index.html.
L[2] += '/' + INDEX_HTML
else:
@@ -560,7 +593,7 @@
(doc, mimetype) = url_open(url)
if doc == '' or mimetype == '':
- url_filename_cache[url] = ''
+ url_filename_cache[nurl] = ''
return ''
# Fix up extension based on mime type.
@@ -577,8 +610,7 @@
ext = '.' + MIME_MAP[mimetype]
subfile = root + ext
- if config.lower:
- subfile = subfile.lower()
+ subfile = subfile.lower()
ans = os.path.join(config.outdir, subfile)
@@ -595,7 +627,7 @@
# Cache and return answer.
wrote_file_set.add(os.path.normcase(os.path.normpath(ans)))
- url_filename_cache[url] = ans
+ url_filename_cache[nurl] = ans
mode = ['wb', 'w'][mimetype.startswith('text')]
@@ -615,7 +647,7 @@
f = open(ans, mode)
f.write(doc)
f.close()
-
+
return ans
def url_to_relative(url, cururl, config):
@@ -625,7 +657,7 @@
"""
cururl = split_section(cururl)[0]
(url, section) = split_section(url)
-
+
L1 = url_to_filename(url, config).replace(os.sep, '/').split('/')
if L1 == '':
return ''
@@ -670,17 +702,26 @@
'url' should be spidered as well.
"""
# False if different domains.
+ url = urlparse.urljoin(config.rooturl, url)
if get_domain(config.rooturl) != get_domain(url):
+ if config.debug:
+ print url, 'not in the same domain'
return False
# False if multiple query fields or parameters found
if url.count('&') >= 1 or url.count(';') > 0:
+ if config.debug:
+ print url, 'with multiple query fields'
return False
if any(x in url for x in ('MediaWiki:', 'Special:', 'Image:', 'Talk:',
'User:', 'Help:')):
+ if config.debug:
+ print url, 'is a forbidden wiki page'
return False
if config.no_images and any(url.strip().lower().endswith(suffix) for suffix
in ('.jpg', '.gif', '.png', '.ico')):
+ if config.debug:
+ print url, 'is a image and you are in no-images mode'
return False
# limit_parent support
@@ -690,11 +731,15 @@
if config.limit_parent and not nurl.startswith(ncurl):
L = nurl.split('/')
if ('.' not in L[-1]):
+ if config.debug:
+ print url, 'is a file outside of scope with unknown extension'
return False
forbidden_parents = ['.php','.html','.htm']
for fp in forbidden_parents:
if fp in L[-1]:
+ if config.debug:
+ print url, 'is a page outside of scope'
return False
return True
@@ -710,13 +755,13 @@
new_urls = []
doc = pre_html_transform(doc, url, config)
-
# Temporarily "get rid" of comments so htmldata will find the URLs
# in the funky "<!--[if" HTML hackery for IE.
doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE)
doc = doc.replace('-->', END_COMMENT_REPLACE)
-
+
L = htmldata.urlextract(doc, url, 'text/html')
+
for item in L:
u = item.url
follow = should_follow(u, config)
@@ -743,7 +788,7 @@
"""
Code interface.
"""
- global conn, domain, counter
+ global conn, domain, counter, redir_cache
if urlparse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'):
out.write('Please do not use robots with the Wikipedia site.\n')
@@ -758,22 +803,41 @@
out.write('Error: Directory exists: ' + str(config.outdir) )
sys.exit(1)
- #pdb.set_trace()
domain = get_domain(config.rooturl)
conn = httplib.HTTPConnection(domain)
print 'connection established to:', domain
complete = set()
pending = set([config.rooturl])
-
- while len(pending) > 0:
- url = pending.pop()
- if normalize_url(url) in complete:
+ start = True
+ while len(pending) > 0:
+ url = normalize_url(pending.pop())
+ if get_domain(url) != domain:
+ url = normalize_url(urlparse.urljoin(config.rooturl, url))
+
+ if url in redir_cache:
+ url = normalize_url(redir_cache[url])
+
+ if url in complete:
+ if config.debug:
+ print url, 'already processed'
continue
- complete.add(normalize_url(url))
-
+ complete.add(url)
filename = url_to_filename(url, config)
+
+ #this is needed for the first path as it doesn't know if it is a redirect
or not in the begining
+ #at this point all the content of redir_cache is relative to the start path
+ if start:
+ start = False
+ nurl = ''
+ for red in redir_cache.iterkeys():
+ nurl = normalize_url(red)
+ url_filename_cache[nurl] = filename
+ if nurl not in complete:
+ complete.add(nurl)
+ if nurl != '':
+ url = normalize_url(redir_cache[url])
if filename == '':
continue
@@ -785,14 +849,13 @@
f = open(filename, 'r')
doc = f.read()
f.close()
-
new_urls = []
if filename.endswith('.html'):
(doc, new_urls) = parse_html(doc, url, config)
elif filename.endswith('.css'):
(doc, new_urls) = parse_css(doc, url, config)
-
+
# Enqueue URLs that we haven't yet spidered.
for u in new_urls:
if normalize_url(u) not in complete:
@@ -800,7 +863,7 @@
if '#' in u:
u = u[:u.index('#')]
pending.add(u)
-
+
# Save document changes to disk
update = False
text_ext = ( 'txt', 'html', 'rtf', 'css', 'sgml', 'xml' )
@@ -822,6 +885,7 @@
print "connection to",domain,"closed."
out.write(str(n) + ' files saved\n')
print counter, "httplib requests done"
+ print errors, "errors not recovered"
def usage():
@@ -844,7 +908,6 @@
-b footer.html, keeps MediaWiki icon and more
design changes.
--no-flatten - Do not flatten directory structure.
- --no-lower - Retain original case for output filenames and dirs.
--no-clean - Do not clean up filenames (clean replaces
non-alphanumeric chars with _, renames math thumbs).
--no-hack-skin - Do not modify skin CSS and HTML for looks.
@@ -886,7 +949,7 @@
"""
try:
(opts, args) = getopt.gnu_getopt(sys.argv[1:], 'fsdl:t:b:i:',
- ['force', 'no-flatten', 'no-lower', 'no-clean',
+ ['force', 'no-flatten', 'no-clean',
'no-hack-skin', 'no-made-by', 'left=',
'top=', 'bottom=', 'index=', 'no-move-href',
'no-remove-png', 'no-remove-history', 'limit-parent',
@@ -907,8 +970,6 @@
config.overwrite = True
if opt in ['--no-flatten', '-s', '-special-mode']:
config.flatten = False
- if opt in ['--no-lower']:
- config.lower = False
if opt in ['--no-clean']:
config.clean = False
if opt in ['--no-hack-skin']:
------------------------------------------------------------------------------
Register Now for Creativity and Technology (CaT), June 3rd, NYC. CaT
is a gathering of tech-side developers & brand creativity professionals. Meet
the minds behind Google Creative Lab, Visual Complexity, Processing, &
iPhoneDevCamp as they present alongside digital heavyweights like Barbarian
Group, R/GA, & Big Spaceship. http://p.sf.net/sfu/creativitycat-com
_______________________________________________
Audacity-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/audacity-cvs