diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 148d340..51978e0 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -85,7 +85,14 @@ def chomp(text): space, strip the string and return a space as suffix of prefix, if needed. This function is used to prevent conversions like foo => ** foo** + + If the text is whitespace-only, preserve it as a single space instead of + returning an empty string (fixes issue #155). """ + # Handle whitespace-only text: preserve as single space (fixes #155) + if text and not text.strip(): + return ('', '', ' ') + prefix = ' ' if text and text[0] == ' ' else '' suffix = ' ' if text and text[-1] == ' ' else '' text = text.strip() @@ -111,6 +118,10 @@ def implementation(self, el, text, parent_tags): prefix, suffix, text = chomp(text) if not text: return '' + # If text is whitespace-only, return just the whitespace without markup + # This preserves spaces from tags like (fixes #155) + if text.isspace(): + return text return '%s%s%s%s%s' % (prefix, markup_prefix, text, markup_suffix, suffix) return implementation diff --git a/tests/test_advanced.py b/tests/test_advanced.py index 6123d8c..320f7fc 100644 --- a/tests/test_advanced.py +++ b/tests/test_advanced.py @@ -3,9 +3,11 @@ def test_chomp(): assert md(' ') == ' ' - assert md(' ') == ' ' - assert md(' ') == ' ' - assert md(' ') == ' ' + # With fix for issue #155, whitespace-only content is preserved as a single space + # so ' ' becomes ' ' (before) + ' ' (preserved) + ' ' (after) = ' ' + assert md(' ') == ' ' + assert md(' ') == ' ' + assert md(' ') == ' ' assert md(' s ') == ' **s** ' assert md(' s ') == ' **s** ' assert md(' s ') == ' **s** ' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index dd99dfb..c47c3d3 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -374,3 +374,35 @@ def test_spaces(): assert md('
foobar') == 'test\n\n```\n foo\n```\n\nbar' + + +def test_whitespace_only_inline_tags(): + """ + Test that whitespace-only inline tags preserve the whitespace. + Fixes issue #155: https://github.com/matthewwithanm/python-markdownify/issues/155 + + When DOCX files have formatting where a space is in its own formatting run + (e.g., "further" [normal] + " " [bold] + "reference" [normal]), the HTML + produced is: further reference + + Previously, this would be converted to "furtherreference" (losing the space). + After the fix, it should be "further reference" (space preserved). + """ + # Whitespace-only strong/b tags should preserve the space + assert md('further reference') == 'further reference' + assert md('word1 word2') == 'word1 word2' + + # Whitespace-only em/i tags should preserve the space + assert md('hello world') == 'hello world' + assert md('foo bar') == 'foo bar' + + # Multiple whitespace characters should collapse to single space + assert md('a b') == 'a b' + assert md('a b') == 'a b' + + # Mixed formatting with whitespace boundary (real-world DOCX pattern) + assert md('The TRUST, but without further reference') == 'The **TRUST,** but without further reference' + + # Tabs and other whitespace should also be preserved as single space + assert md('a\tb') == 'a b' + assert md('a\nb') == 'a b'