File tree Expand file tree Collapse file tree 4 files changed +26
-13
lines changed
Expand file tree Collapse file tree 4 files changed +26
-13
lines changed Original file line number Diff line number Diff line change @@ -157,12 +157,16 @@ strip_document
157157 within the document are unaffected.
158158 Defaults to ``STRIP ``.
159159
160- beautiful_soup_parser
161- Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
162- as `html5lib `, `lxml ` or even a custom parser as long as it is installed on the execution
163- environment. Defaults to ``html.parser ``.
164-
165- .. _BeautifulSoup : https://www.crummy.com/software/BeautifulSoup/
160+ bs4_options
161+ Specify additional configuration options for the ``BeautifulSoup `` object
162+ used to interpret the HTML markup. String and list values (such as ``lxml ``
163+ or ``html5lib ``) are treated as ``features `` arguments to control parser
164+ selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"} ``)
165+ are treated as full kwargs to be used for the BeautifulSoup constructor,
166+ allowing specification of any parameter. For parameter details, see the
167+ `BeautifulSoup`_ documentation.
168+
169+ .. _BeautifulSoup : https://www.crummy.com/software/BeautifulSoup/bs4/doc/
166170
167171Options may be specified as kwargs to the ``markdownify `` function, or as a
168172nested ``Options `` class in ``MarkdownConverter `` subclasses.
Original file line number Diff line number Diff line change @@ -154,7 +154,7 @@ def _next_block_content_sibling(el):
154154class MarkdownConverter (object ):
155155 class DefaultOptions :
156156 autolinks = True
157- beautiful_soup_parser = 'html.parser'
157+ bs4_options = 'html.parser'
158158 bullets = '*+-' # An iterable of bullet types.
159159 code_language = ''
160160 code_language_callback = None
@@ -188,11 +188,15 @@ def __init__(self, **options):
188188 raise ValueError ('You may specify either tags to strip or tags to'
189189 ' convert, but not both.' )
190190
191+ # If a string or list is passed to bs4_options, assume it is a 'features' specification
192+ if not isinstance (self .options ['bs4_options' ], dict ):
193+ self .options ['bs4_options' ] = {'features' : self .options ['bs4_options' ]}
194+
191195 # Initialize the conversion function cache
192196 self .convert_fn_cache = {}
193197
194198 def convert (self , html ):
195- soup = BeautifulSoup (html , self .options ['beautiful_soup_parser ' ])
199+ soup = BeautifulSoup (html , ** self .options ['bs4_options ' ])
196200 return self .convert_soup (soup )
197201
198202 def convert_soup (self , soup ):
Original file line number Diff line number Diff line change @@ -70,12 +70,11 @@ def main(argv=sys.argv[1:]):
7070 parser .add_argument ('-w' , '--wrap' , action = 'store_true' ,
7171 help = "Wrap all text paragraphs at --wrap-width characters." )
7272 parser .add_argument ('--wrap-width' , type = int , default = 80 )
73- parser .add_argument ('-p' , '--beautiful-soup-parser' ,
74- dest = 'beautiful_soup_parser' ,
73+ parser .add_argument ('--bs4-options' ,
7574 default = 'html.parser' ,
76- help = "Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
77- "as html5lib, lxml or even a custom parser as long as it is installed on the execution "
78- "environment ." )
75+ help = "Specifies the parser that BeautifulSoup should use to parse "
76+ "the HTML markup. Examples include 'html.parser', 'lxml', and "
77+ "'html5lib' ." )
7978
8079 args = parser .parse_args (argv )
8180 print (markdownify (** vars (args )))
Original file line number Diff line number Diff line change @@ -32,3 +32,9 @@ def test_strip_document():
3232 assert markdownify ("<p>Hello</p>" , strip_document = RSTRIP ) == "\n \n Hello"
3333 assert markdownify ("<p>Hello</p>" , strip_document = STRIP ) == "Hello"
3434 assert markdownify ("<p>Hello</p>" , strip_document = None ) == "\n \n Hello\n \n "
35+
36+
37+ def test_bs4_options ():
38+ assert markdownify ("<p>Hello</p>" , bs4_options = "html.parser" ) == "Hello"
39+ assert markdownify ("<p>Hello</p>" , bs4_options = ["html.parser" ]) == "Hello"
40+ assert markdownify ("<p>Hello</p>" , bs4_options = {"features" : "html.parser" }) == "Hello"
You can’t perform that action at this time.
0 commit comments