kaputtnik has proposed merging lp:~widelands-dev/widelands-website/update_beautifulsoup4 into lp:widelands-website.
Commit message: Update BeautifulSoup and make needed changes Requested reviews: Widelands Developers (widelands-dev) For more details, see: https://code.launchpad.net/~widelands-dev/widelands-website/update_beautifulsoup4/+merge/358571 Update BeautifulSoup3 to BeautifulSoup4. This is a prerequisite for the Python update. In contrast to bs3, bs4 escapes all (Python) strings, so it is no longer possible to insert a plain (unicode) string like "<a href://example.com>LINKTEXT</a>". Such a string ends up HTML-escaped (i.e. "&lt;a href://example.com&gt;LINKTEXT&lt;/a&gt;") in a BS4 object. This branch takes care of this and modifies the affected code to use BeautifulSoup4 objects. The new code could perhaps be smarter, but I find it understandable. I have also refactored some variables and comments. The rendering times are roughly equal in comparison with BeautifulSoup3. E.g. for the Developers page: /developers/ bs3: ~0.62s bs4: ~0.45s For /wiki/WikiSyntax/ bs3: ~0.14s bs4: ~0.14s The regular expression for finding pasted plain-text links is tested here: https://regexr.com/42pq5 I have also removed the SMILEY_PREESCAPING workaround, because things work as-is right now. The only remaining problem is that the 'devilish' smiley won't work if it is placed as the very first characters. I am in favor of replacing '>:-)' with ']:-)' to fix this. Any remarks on this? -- Your team Widelands Developers is requested to review the proposed merge of lp:~widelands-dev/widelands-website/update_beautifulsoup4 into lp:widelands-website.
=== modified file 'mainpage/templatetags/wl_markdown.py' --- mainpage/templatetags/wl_markdown.py 2017-11-14 16:54:28 +0000 +++ mainpage/templatetags/wl_markdown.py 2018-11-09 18:07:36 +0000 @@ -25,7 +25,7 @@ import urllib import bleach -from BeautifulSoup import BeautifulSoup, NavigableString +from bs4 import BeautifulSoup, NavigableString # If we can import a Wiki module with Articles, we # will check for internal wikipages links in all internal @@ -38,8 +38,7 @@ # We will also need the site domain from django.contrib.sites.models import Site -from settings import SITE_ID, SMILEYS, SMILEY_DIR, \ - SMILEY_PREESCAPING +from settings import SITE_ID, SMILEYS, SMILEY_DIR try: _domain = Site.objects.get(pk=SITE_ID).domain @@ -60,42 +59,55 @@ def _insert_smileys(text): """This searches for smiley symbols in the current text and replaces them with the correct images. - - Only replacing if smiley symbols aren't in a word (e.g. http://....) - - """ - words = text.split(' ') - for sc, img in SMILEYS: - if sc in words: - words[words.index( - sc)] = "<img src='%s%s' alt='%s' />" % (SMILEY_DIR, img, img) - text = ' '.join(words) - return text - - -def _insert_smiley_preescaping(text): - """This searches for smiley symbols in the current text and replaces them - with the correct images.""" - for before, after in SMILEY_PREESCAPING: - text = text.replace(before, after) - return text + + Then we have to reassemble the whole contents...""" + + tmp_content = [] + for content in text.parent.contents: + try: + # If this fails, content is probably '\n' or not a string, e.g. 
<br /> + words = content.split(' ') + except: + # apply the unsplittable content and continue + tmp_content.append(content) + continue + + for i, word in enumerate(words): + smiley = "" + for sc, img in SMILEYS: + if word == sc: + smiley = img + if smiley: + img_tag = BeautifulSoup(features="lxml").new_tag('img') + img_tag['src'] = "{}{}".format(SMILEY_DIR, smiley) + img_tag['alt'] = smiley + tmp_content.append(img_tag) + # Apply a space after the smiley + tmp_content.append(NavigableString(' ')) + else: + if i < (len(words) - 1): + # Apply a space after each word, except the last word + word = word + ' ' + tmp_content.append(NavigableString(word)) + + text.parent.contents = [x for x in tmp_content] def _classify_link(tag): - """Returns a classname to insert if this link is in any way special + """Applies a classname if this link is in any way special (external or missing wikipages) - tag to classify for + tag: classify for this tag """ # No class change for image links - if tag.findChild('img') != None: - return None + if tag.next_element.name == 'img': + return try: href = tag['href'].lower() except KeyError: - return None + return # Check for external link if href.startswith('http'): @@ -105,67 +117,93 @@ external = False break if external: - return {'class': 'externalLink', 'title': 'This link refers to outer space'} + tag['class'] = "externalLink" + tag['title'] = "This link refers to outer space" + return if '/profile/' in (tag['href']): - return {'class': 'userLink', 'title': 'This link refers to a userpage'} + tag['class'] = "userLink" + tag['title'] = "This link refers to a userpage" + return if check_for_missing_wikipages and href.startswith('/wiki/'): # Check for missing wikilink /wiki/PageName[/additionl/stuff] # Using href because we need cAsEs here - pn = urllib.unquote(tag['href'][6:].split('/', 1)[0]) + article_name = urllib.unquote(tag['href'][6:].split('/', 1)[0]) - if not len(pn): # Wiki root link is not a page - return {'class': 'wrongLink', 
'title': 'This Link misses an articlename'} + if not len(article_name): # Wiki root link is not a page + tag['class'] = "wrongLink" + tag['title'] = "This Link misses an articlename" + return # Wiki special pages are also not counted - if pn in ['list', 'search', 'history', 'feeds', 'observe', 'edit']: - return {'class': 'specialLink'} + if article_name in ['list', 'search', 'history', 'feeds', 'observe', 'edit']: + tag['class'] = "specialLink" + return # Check for a redirect try: # try to get the article id; if this fails an IndexError is raised a_id = ChangeSet.objects.filter( - old_title=pn).values_list('article_id')[0] + old_title=article_name).values_list('article_id')[0] # get actual title of article act_t = Article.objects.get(id=a_id[0]).title - if pn != act_t: - return {'title': "This is a redirect and points to \"" + act_t + "\""} + if article_name != act_t: + tag['title'] = "This is a redirect and points to \"" + act_t + "\"" + return else: - return None + return except IndexError: pass # article missing (or misspelled) - if Article.objects.filter(title=pn).count() == 0: - return {'class': 'missingLink', 'title': 'This Link is misspelled or missing. Click to create it anyway.'} - - return None - - -def _clickable_image(tag): + if Article.objects.filter(title=article_name).count() == 0: + tag['class'] = "missingLink" + tag['title'] = "This Link is misspelled or missing. Click to create it anyway." + return + return + + +def _make_clickable_images(tag): # is external link? if tag['src'].startswith('http'): - # is allways a link? 
+ # Do not change if it is allready a link if tag.parent.name != 'a': # add link to image - text = '<a href=' + tag['src'] + \ - '><img src=' + tag['src'] + '></a>' - return text - return None - + new_link = BeautifulSoup(features="lxml").new_tag('a') + new_link['href'] = tag['src'] + new_img = BeautifulSoup(features="lxml").new_tag('img') + new_img['src'] = tag['src'] + new_img['alt'] = tag['alt'] + new_link.append(new_img) + tag.replace_with(new_link) + return + + +def find_smiley_Strings(bs4_string): + """Find strings that contain a smiley symbol""" + + if bs4_string.parent.name.lower() == 'code': + return False + + #for element in bs4_string.parent.contents: + for sc in SMILEYS: + if sc[0] in bs4_string: + return True + return False # Predefine the markdown extensions here to have a clean code in # do_wl_markdown() md_extensions = ['extra', 'toc', SemanticWikiLinkExtension()] def do_wl_markdown(value, *args, **keyw): - # Do Preescaping for markdown, so that some things stay intact - # This is currently only needed for this smiley ">:-)" - value = _insert_smiley_preescaping(value) - custom = keyw.pop('custom', True) + """Apply wl specific things, like smileys or colored links. + + If something get modified, it is mostky done directly in the subfunctions""" + + beautify = keyw.pop('beautify', True) html = smart_str(markdown(value, extensions=md_extensions)) # Sanitize posts from potencial untrusted users (Forum/Wiki/Maps) @@ -173,49 +211,29 @@ html = mark_safe(bleach.clean( html, tags=BLEACH_ALLOWED_TAGS, attributes=BLEACH_ALLOWED_ATTRIBUTES)) - # Since we only want to do replacements outside of tags (in general) and not between - # <a> and </a> we partition our site accordingly - # BeautifoulSoup does all the heavy lifting - soup = BeautifulSoup(html) + # Prepare the html and apply smileys and classes. + # BeautifulSoup objects are all references, so changing a variable + # derived from the soup will take effect on the soup itself. 
+ # Because of that the called functions will modify the soup directly. + soup = BeautifulSoup(html, features="lxml") if len(soup.contents) == 0: # well, empty soup. Return it return unicode(soup) - for text in soup.findAll(text=True): - # Do not replace inside a link - if text.parent.name == 'a': - continue - - # We do our own small preprocessing of the stuff we got, after markdown - # went over it General consensus is to avoid replacing anything in - # links [blah](blkf) - if custom: - rv = text - # Replace smileys; only outside "code-tags" - if not text.parent.name == 'code': - rv = _insert_smileys(rv) - - text.replaceWith(rv) - - # This call slows the whole function down... - # unicode->reparsing. - # The function goes from .5 ms to 1.5ms on my system - # Well, for our site with it's little traffic it's maybe not so important... - # What a waste of cycles :( - soup = BeautifulSoup(unicode(soup)) - # We have to go over this to classify links - for tag in soup.findAll('a'): - rv = _classify_link(tag) - if rv: - for attribute in rv.iterkeys(): - tag[attribute] = rv.get(attribute) - - # All external images gets clickable - # This applies only in forum - for tag in soup.findAll('img'): - link = _clickable_image(tag) - if link: - tag.replaceWith(link) + if beautify: + # Insert smileys + smiley_text = soup.find_all(string=find_smiley_Strings) + for text in smiley_text: + _insert_smileys(text) + + # Classify links + for tag in soup.find_all('a'): + _classify_link(tag) + + # All external images gets clickable + # This applies only in forum + for tag in soup.find_all('img'): + _make_clickable_images(tag) return unicode(soup) === modified file 'mainpage/views.py' --- mainpage/views.py 2018-04-03 05:18:03 +0000 +++ mainpage/views.py 2018-11-09 18:07:36 +0000 @@ -126,7 +126,7 @@ except IOError: txt = txt + "Couldn't find developer file!" 
- txt = do_wl_markdown(txt, custom=False) + txt = do_wl_markdown(txt, beautify=False) return render(request, 'mainpage/developers.html', {'developers': txt} === modified file 'pip_requirements.txt' --- pip_requirements.txt 2018-10-03 11:03:43 +0000 +++ pip_requirements.txt 2018-11-09 18:07:36 +0000 @@ -1,6 +1,6 @@ # Python requirements for widelands-website at 22.06.2017 -BeautifulSoup==3.2.0 +beautifulsoup4==4.6.3 Django==1.11.12 django-haystack==2.8.1 # django-messages is very old on pypi @@ -11,6 +11,7 @@ django-registration==2.4.1 django-tagging==0.4.5 gunicorn==19.7.1 +lxml==4.2.5 Markdown==2.6.8 mysqlclient==1.3.10 numpy==1.13.0 === modified file 'pybb/util.py' --- pybb/util.py 2018-10-01 16:41:29 +0000 +++ pybb/util.py 2018-11-09 18:07:36 +0000 @@ -2,8 +2,9 @@ import random import traceback import json +import re -from BeautifulSoup import BeautifulSoup +from bs4 import BeautifulSoup, NavigableString from datetime import datetime from django.shortcuts import render from django.http import HttpResponse @@ -11,7 +12,6 @@ from django.utils.translation import check_for_language from django.utils.encoding import force_unicode from django import forms -from django.template.defaultfilters import urlize as django_urlize from django.core.paginator import Paginator, EmptyPage, InvalidPage from django.conf import settings from pybb import settings as pybb_settings @@ -145,6 +145,16 @@ return form +PLAIN_LINK_RE = re.compile(r'(http[s]?:\/\/[-a-zA-Z0-9@:%._\+~#=/?]+)') +def exclude_code_tag(bs4_string): + if bs4_string.parent.name == 'code': + return False + m = PLAIN_LINK_RE.search(bs4_string) + if m: + return True + return False + + def urlize(data): """Urlize plain text links in the HTML contents. 
@@ -152,18 +162,29 @@ """ - soup = BeautifulSoup(data) - for chunk in soup.findAll(text=True): - islink = False - ptr = chunk.parent - while ptr.parent: - if ptr.name == 'a' or ptr.name == 'code': - islink = True - break - ptr = ptr.parent - if not islink: - # Using unescape to prevent conversation of f.e. > to &gt; - chunk = chunk.replaceWith(django_urlize(unicode(unescape(chunk)))) + soup = BeautifulSoup(data, 'lxml') + for found_string in soup.find_all(string=exclude_code_tag): + new_content = [] + strings_or_tags = found_string.parent.contents + for string_or_tag in strings_or_tags: + try: + for string in PLAIN_LINK_RE.split(string_or_tag): + if string.startswith('http'): + # Apply an a-Tag + tag = soup.new_tag('a') + tag['href'] = string + tag.string = string + tag['nofollow'] = 'true' + new_content.append(tag) + else: + # This is just a string, apply a bs4-string + new_content.append(NavigableString(string)) + except: + # Regex failed, so apply what ever it is + new_content.append(string_or_tag) + + # Apply the new content + found_string.parent.contents = new_content return unicode(soup) === modified file 'settings.py' --- settings.py 2018-10-05 19:10:18 +0000 +++ settings.py 2018-11-09 18:07:36 +0000 @@ -213,8 +213,7 @@ (':))', 'face-smile-big.png'), (':-)', 'face-smile.png'), (':)', 'face-smile.png'), - # Hack around markdown replacement. see also SMILEY_PREESCAPING - ('>:-)', 'face-devilish.png'), + ('>:-)', 'face-devilish.png'), ('8-)', 'face-glasses.png'), ('8)', 'face-glasses.png'), (':-D', 'face-grin.png'), @@ -243,10 +242,6 @@ (';-)', 'face-wink.png'), (';)', 'face-wink.png'), ] -# This needs to be done to keep some stuff hidden from markdown -SMILEY_PREESCAPING = [ - ('>:-)', '\>:-)') -] ################# # Search Config #
_______________________________________________ Mailing list: https://launchpad.net/~widelands-dev Post to : widelands-dev@lists.launchpad.net Unsubscribe : https://launchpad.net/~widelands-dev More help : https://help.launchpad.net/ListHelp