bs4 into lp:openlp

Andreas Preikschat Wed, 17 Apr 2013 11:38:31 -0700

Andreas Preikschat has proposed merging lp:~googol/openlp/bs4 into lp:openlp.


Requested reviews:
  OpenLP Core (openlp-core)

For more details, see:
https://code.launchpad.net/~googol/openlp/bs4/+merge/159468

Hello,

- ported code to beautifulsoup 4
- moved regular expressions to the top
- fixed regression

Function tests worked fine.
I could not test the interface tests because bs4 is not installed on the server 
and the interface tests fails here (SegFault).
-- 
https://code.launchpad.net/~googol/openlp/bs4/+merge/159468
Your team OpenLP Core is requested to review the proposed merge of 
lp:~googol/openlp/bs4 into lp:openlp.

=== modified file 'openlp/core/ui/exceptionform.py'
--- openlp/core/ui/exceptionform.py	2013-04-02 20:52:31 +0000
+++ openlp/core/ui/exceptionform.py	2013-04-17 18:37:26 +0000
@@ -35,7 +35,7 @@
 import platform
 
 import sqlalchemy
-import BeautifulSoup
+from bs4 import BeautifulSoup
 from lxml import etree
 from PyQt4 import Qt, QtCore, QtGui, QtWebKit
 

=== modified file 'openlp/plugins/bibles/lib/http.py'
--- openlp/plugins/bibles/lib/http.py	2013-03-07 08:05:43 +0000
+++ openlp/plugins/bibles/lib/http.py	2013-04-17 18:37:26 +0000
@@ -36,7 +36,7 @@
 import urllib
 from HTMLParser import HTMLParseError
 
-from BeautifulSoup import BeautifulSoup, NavigableString, Tag
+from bs4 import BeautifulSoup, NavigableString, Tag
 
 from openlp.core.lib import Registry, translate
 from openlp.core.lib.ui import critical_error_message_box
@@ -44,6 +44,9 @@
 from openlp.plugins.bibles.lib import SearchResults
 from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, Book
 
+CLEANER_REGEX = re.compile(r'&nbsp;|<br />|\'\+\'')
+FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])')
+REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}')
 UGLY_CHARS = {
     u'\u2014': u' - ',
     u'\u2018': u'\'',
@@ -52,9 +55,12 @@
     u'\u201d': u'"',
     u'&nbsp;': u' '
 }
+VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
+
 
 log = logging.getLogger(__name__)
 
+
 class BGExtract(object):
     """
     Extract verses from BibleGateway
@@ -78,9 +84,9 @@
             An HTML class attribute for further qualification.
         """
         if class_:
-            all_tags = parent.findAll(tag, class_)
+            all_tags = parent.find_all(tag, class_)
         else:
-            all_tags = parent.findAll(tag)
+            all_tags = parent.find_all(tag)
         for element in all_tags:
             element.extract()
 
@@ -93,14 +99,15 @@
         """
         if isinstance(tag, NavigableString):
             return None, unicode(tag)
-        elif tag.get('class') == 'versenum' or tag.get('class') == 'versenum mid-line':
+        elif tag.get('class')[0] == "versenum" or tag.get('class')[0] == 'versenum mid-line':
             verse = unicode(tag.string).replace('[', '').replace(']', '').strip()
             return verse, None
-        elif tag.get('class') == 'chapternum':
+        elif tag.get('class')[0] == 'chapternum':
             verse = '1'
             return verse, None
         else:
-            verse, text = None, ''
+            verse = None
+            text = ''
             for child in tag.contents:
                 c_verse, c_text = self._extract_verse(child)
                 if c_verse:
@@ -137,7 +144,8 @@
         tags = tags[::-1]
         current_text = ''
         for tag in tags:
-            verse, text = None, ''
+            verse = None
+            text = ''
             for child in tag.contents:
                 c_verse, c_text = self._extract_verse(child)
                 if c_verse:
@@ -173,8 +181,8 @@
 
     def _extract_verses_old(self, div):
         """
-        Use the old style of parsing for those Bibles on BG who mysteriously
-        have not been migrated to the new (still broken) HTML.
+        Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still
+        broken) HTML.
 
         ``div``
             The parent div.
@@ -185,13 +193,12 @@
         if first_verse and first_verse.contents:
             verse_list[1] = unicode(first_verse.contents[0])
         for verse in div(u'sup', u'versenum'):
-            raw_verse_num = verse.next
+            raw_verse_num = verse.next_element
             clean_verse_num = 0
-            # Not all verses exist in all translations and may or may not be
-            # represented by a verse number. If they are not fine, if they are
-            # it will probably be in a format that breaks int(). We will then
-            # have no idea what garbage may be sucked in to the verse text so
-            # if we do not get a clean int() then ignore the verse completely.
+            # Not all verses exist in all translations and may or may not be represented by a verse number. If they are
+            # not fine, if they are it will probably be in a format that breaks int(). We will then have no idea what
+            # garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse
+            # completely.
             try:
                 clean_verse_num = int(str(raw_verse_num))
             except ValueError:
@@ -201,16 +208,16 @@
             except TypeError:
                 log.warn(u'Illegal verse number: %s', unicode(raw_verse_num))
             if clean_verse_num:
-                verse_text = raw_verse_num.next
-                part = raw_verse_num.next.next
-                while not (isinstance(part, Tag) and part.get(u'class') == u'versenum'):
+                verse_text = raw_verse_num.next_element
+                part = raw_verse_num.next_element.next_element
+                while not (isinstance(part, Tag) and part.get(u'class')[0] == u'versenum'):
                     # While we are still in the same verse grab all the text.
                     if isinstance(part, NavigableString):
                         verse_text += part
-                    if isinstance(part.next, Tag) and part.next.name == u'div':
+                    if isinstance(part.next_element, Tag) and part.next_element.name == u'div':
                         # Run out of verses so stop.
                         break
-                    part = part.next
+                    part = part.next_element
                 verse_list[clean_verse_num] = unicode(verse_text)
         return verse_list
 
@@ -230,15 +237,14 @@
         log.debug(u'BGExtract.get_bible_chapter("%s", "%s", "%s")', version, book_name, chapter)
         url_book_name = urllib.quote(book_name.encode("utf-8"))
         url_params = u'search=%s+%s&version=%s' % (url_book_name, chapter, version)
-        cleaner = [(re.compile('&nbsp;|<br />|\'\+\''), lambda match: '')]
         soup = get_soup_for_bible_ref(
             u'http://www.biblegateway.com/passage/?%s' % url_params,
-            pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='', cleaner=cleaner)
+            pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='')
         if not soup:
             return None
         div = soup.find('div', 'result-text-style-normal')
         self._clean_soup(div)
-        span_list = div.findAll('span', 'text')
+        span_list = div.find_all('span', 'text')
         log.debug('Span list: %s', span_list)
         if not span_list:
             # If we don't get any spans then we must have the old HTML format
@@ -282,7 +288,7 @@
         self.application.process_events()
         content = soup.find(u'table', u'infotable')
         if content:
-            content = content.findAll(u'tr')
+            content = content.find_all(u'tr')
         if not content:
             log.error(u'No books found in the Biblegateway response.')
             send_error_message(u'parse')
@@ -341,19 +347,17 @@
             log.error(u'No verses found in the Bibleserver response.')
             send_error_message(u'parse')
             return None
-        content = content.find(u'div').findAll(u'div')
-        verse_number = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
+        content = content.find(u'div').find_all(u'div')
         verses = {}
         for verse in content:
             self.application.process_events()
-            versenumber = int(verse_number.sub(r'\3', verse[u'class']))
+            versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', u' '.join(verse[u'class'])))
             verses[versenumber] = verse.contents[1].rstrip(u'\n')
         return SearchResults(book_name, chapter, verses)
 
     def get_books_from_http(self, version):
         """
-        Load a list of all books a Bible contains from Bibleserver mobile
-        website.
+        Load a list of all books a Bible contains from Bibleserver mobile website.
 
         ``version``
             The version of the Bible like NIV for New International Version
@@ -369,9 +373,19 @@
             log.error(u'No books found in the Bibleserver response.')
             send_error_message(u'parse')
             return None
-        content = content.findAll(u'li')
+        content = content.find_all(u'li')
         return [book.contents[0].contents[0] for book in content]
 
+    def _get_application(self):
+        """
+        Adds the openlp to the class dynamically
+        """
+        if not hasattr(self, u'_application'):
+            self._application = Registry().get(u'application')
+        return self._application
+
+    application = property(_get_application)
+
 
 class CWExtract(object):
     """
@@ -404,14 +418,12 @@
         if not soup:
             return None
         self.application.process_events()
-        html_verses = soup.findAll(u'span', u'versetext')
+        html_verses = soup.find_all(u'span', u'versetext')
         if not html_verses:
             log.error(u'No verses found in the CrossWalk response.')
             send_error_message(u'parse')
             return None
         verses = {}
-        reduce_spaces = re.compile(r'[ ]{2,}')
-        fix_punctuation = re.compile(r'[ ]+([.,;])')
         for verse in html_verses:
             self.application.process_events()
             verse_number = int(verse.contents[0].contents[0])
@@ -432,11 +444,10 @@
                                 if isinstance(subsub, NavigableString):
                                     verse_text += subsub
             self.application.process_events()
-            # Fix up leading and trailing spaces, multiple spaces, and spaces
-            # between text and , and .
+            # Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and .
             verse_text = verse_text.strip(u'\n\r\t ')
-            verse_text = reduce_spaces.sub(u' ', verse_text)
-            verse_text = fix_punctuation.sub(r'\1', verse_text)
+            verse_text = REDUCE_SPACES_REGEX.sub(u' ', verse_text)
+            verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text)
             verses[verse_number] = verse_text
         return SearchResults(book_name, chapter, verses)
 
@@ -458,7 +469,7 @@
             log.error(u'No books found in the Crosswalk response.')
             send_error_message(u'parse')
             return None
-        content = content.findAll(u'li')
+        content = content.find_all(u'li')
         books = []
         for book in content:
             book = book.find(u'a')
@@ -481,9 +492,8 @@
 
     def __init__(self, parent, **kwargs):
         """
-        Finds all the bibles defined for the system
-        Creates an Interface Object for each bible containing connection
-        information
+        Finds all the bibles defined for the system. Creates an Interface Object for each bible containing connection
+        information.
 
         Throws Exception if no Bibles are found.
 
@@ -492,8 +502,7 @@
         BibleDB.__init__(self, parent, **kwargs)
         self.download_source = kwargs[u'download_source']
         self.download_name = kwargs[u'download_name']
-        # TODO: Clean up proxy stuff. We probably want one global proxy per
-        # connection type (HTTP and HTTPS) at most.
+        # TODO: Clean up proxy stuff. We probably want one global proxy per connection type (HTTP and HTTPS) at most.
         self.proxy_server = None
         self.proxy_username = None
         self.proxy_password = None
@@ -508,8 +517,8 @@
 
     def do_import(self, bible_name=None):
         """
-        Run the import. This method overrides the parent class method. Returns
-        ``True`` on success, ``False`` on failure.
+        Run the import. This method overrides the parent class method. Returns ``True`` on success, ``False`` on
+        failure.
         """
         self.wizard.progress_bar.setMaximum(68)
         self.wizard.increment_progress_bar(translate('BiblesPlugin.HTTPBible', 'Registering Bible and loading books...'))
@@ -549,8 +558,7 @@
             if self.stop_import_flag:
                 break
             self.wizard.increment_progress_bar(translate(
-                'BiblesPlugin.HTTPBible', 'Importing %s...',
-                'Importing <book name>...') % book)
+                'BiblesPlugin.HTTPBible', 'Importing %s...', 'Importing <book name>...') % book)
             book_ref_id = self.get_book_ref_id_by_name(book, len(books), language_id)
             if not book_ref_id:
                 log.exception(u'Importing books from %s - download name: "%s" '\
@@ -567,22 +575,19 @@
 
     def get_verses(self, reference_list, show_error=True):
         """
-        A reimplementation of the ``BibleDB.get_verses`` method, this one is
-        specifically for web Bibles. It first checks to see if the particular
-        chapter exists in the DB, and if not it pulls it from the web. If the
-        chapter DOES exist, it simply pulls the verses from the DB using the
-        ancestor method.
+        A reimplementation of the ``BibleDB.get_verses`` method, this one is specifically for web Bibles. It first
+        checks to see if the particular chapter exists in the DB, and if not it pulls it from the web. If the chapter
+        DOES exist, it simply pulls the verses from the DB using the ancestor method.
 
         ``reference_list``
-            This is the list of references the media manager item wants. It is
-            a list of tuples, with the following format::
+            This is the list of references the media manager item wants. It is a list of tuples, with the following
+            format::
 
                 (book_reference_id, chapter, start_verse, end_verse)
 
-            Therefore, when you are looking for multiple items, simply break
-            them up into references like this, bundle them into a list. This
-            function then runs through the list, and returns an amalgamated
-            list of ``Verse`` objects. For example::
+            Therefore, when you are looking for multiple items, simply break them up into references like this, bundle
+            them into a list. This function then runs through the list, and returns an amalgamated list of ``Verse``
+            objects. For example::
 
                 [(u'35', 1, 1, 1), (u'35', 2, 2, 3)]
         """
@@ -643,7 +648,7 @@
         Return the number of chapters in a particular book.
 
         ``book``
-        The book object to get the chapter count for.
+            The book object to get the chapter count for.
         """
         log.debug(u'HTTPBible.get_chapter_count("%s")', book.name)
         return BiblesResourcesDB.get_chapter_count(book.book_reference_id)
@@ -671,8 +676,7 @@
 
     application = property(_get_application)
 
-def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None,
-    pre_parse_substitute=None, cleaner=None):
+def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, pre_parse_substitute=None):
     """
     Gets a webpage and returns a parsed and optionally cleaned soup or None.
 
@@ -683,14 +687,11 @@
         An optional HTTP header to pass to the bible web server.
 
     ``pre_parse_regex``
-        A regular expression to run on the webpage. Allows manipulation of the
-        webpage before passing to BeautifulSoup for parsing.
+        A regular expression to run on the webpage. Allows manipulation of the webpage before passing to BeautifulSoup
+        for parsing.
 
     ``pre_parse_substitute``
         The text to replace any matches to the regular expression with.
-
-    ``cleaner``
-        An optional regex to use during webpage parsing.
     """
     if not reference_url:
         return None
@@ -703,10 +704,8 @@
         page_source = re.sub(pre_parse_regex, pre_parse_substitute, page_source)
     soup = None
     try:
-        if cleaner:
-            soup = BeautifulSoup(page_source, markupMassage=cleaner)
-        else:
-            soup = BeautifulSoup(page_source)
+        soup = BeautifulSoup(page_source)
+        CLEANER_REGEX.sub(u'', unicode(soup))
     except HTMLParseError:
         log.exception(u'BeautifulSoup could not parse the bible page.')
     if not soup:

=== modified file 'openlp/plugins/custom/forms/editcustomform.py'
--- openlp/plugins/custom/forms/editcustomform.py	2013-03-28 21:49:11 +0000
+++ openlp/plugins/custom/forms/editcustomform.py	2013-04-17 18:37:26 +0000
@@ -257,6 +257,6 @@
         # We must have at least one slide.
         if self.slide_list_view.count() == 0:
             critical_error_message_box(message=translate('CustomPlugin.EditCustomForm',
-                'You need to add at least one slide'))
+                'You need to add at least one slide.'))
             return False
         return True

=== modified file 'scripts/check_dependencies.py'
--- scripts/check_dependencies.py	2013-03-30 21:54:42 +0000
+++ scripts/check_dependencies.py	2013-04-17 18:37:26 +0000
@@ -79,7 +79,7 @@
     'lxml',
     'chardet',
     'enchant',
-    'BeautifulSoup',
+    'bs4',
     'mako',
     'migrate',
     'uno',

=== modified file 'tests/interfaces/openlp_plugins/custom/forms/test_customform.py'
--- tests/interfaces/openlp_plugins/custom/forms/test_customform.py	2013-03-30 09:01:21 +0000
+++ tests/interfaces/openlp_plugins/custom/forms/test_customform.py	2013-04-17 18:37:26 +0000
@@ -62,3 +62,45 @@
             QtTest.QTest.mouseClick(self.form.add_button, QtCore.Qt.LeftButton)
             #THEN: One slide should be added.
             assert self.form.slide_list_view.count() == 1, u'There should be one slide added.'
+
+    def validate_not_valid_part1_test(self):
+        """
+        Test the _validate() method.
+        """
+        # GIVEN: Mocked methods.
+        with patch(u'openlp.plugins.custom.forms.editcustomform.critical_error_message_box') as \
+                mocked_critical_error_message_box:
+            mocked_displayText = MagicMock()
+            mocked_displayText.return_value = u''
+            self.form.title_edit.displayText = mocked_displayText
+            mocked_setFocus = MagicMock()
+            self.form.title_edit.setFocus = mocked_setFocus
+
+            # WHEN: Call the method.
+            result = self.form._validate()
+
+            # THEN: The validate method should have returned False.
+            assert not result, u'The _validate() method should have retured False'
+            mocked_setFocus.assert_called_with()
+            mocked_critical_error_message_box.assert_called_with(message=u'You need to type in a title.')
+
+    def validate_not_valid_part2_test(self):
+        """
+        Test the _validate() method.
+        """
+        # GIVEN: Mocked methods.
+        with patch(u'openlp.plugins.custom.forms.editcustomform.critical_error_message_box') as \
+                mocked_critical_error_message_box:
+            mocked_displayText = MagicMock()
+            mocked_displayText.return_value = u'something'
+            self.form.title_edit.displayText = mocked_displayText
+            mocked_count = MagicMock()
+            mocked_count.return_value = 0
+            self.form.slide_list_view.count = mocked_count
+
+            # WHEN: Call the method.
+            result = self.form._validate()
+
+            # THEN: The validate method should have returned False.
+            assert not result, u'The _validate() method should have retured False'
+            mocked_critical_error_message_box.assert_called_with(message=u'You need to add at least one slide.')

_______________________________________________
Mailing list: https://launchpad.net/~openlp-core
Post to     : openlp-core@lists.launchpad.net
Unsubscribe : https://launchpad.net/~openlp-core
More help   : https://help.launchpad.net/ListHelp

[Openlp-core] [Merge] lp:~googol/openlp/bs4 into lp:openlp

Reply via email to