aboutsummaryrefslogtreecommitdiff
path: root/src/str_format.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/str_format.py')
-rw-r--r--src/str_format.py72
1 files changed, 72 insertions, 0 deletions
diff --git a/src/str_format.py b/src/str_format.py
new file mode 100644
index 0000000..5d8c412
--- /dev/null
+++ b/src/str_format.py
@@ -0,0 +1,72 @@
+import pathlib
+import re
+import unicodedata
+
def safe_path(name):
    """Turn an arbitrary name into a lowercase, dash-separated slug.

    The name is lowercased, stripped of combining accents, and every
    character that is not alphanumeric becomes a word separator. The
    remaining words are joined with single dashes, so runs of punctuation
    or whitespace never produce empty segments.
    """
    normalized = unaccent(name.lower())
    spaced = ''.join(map(alnum_or_space, normalized))
    words = spaced.split()
    return '-'.join(words)
+
def unaccent(s):
    """Return *s* with its nonspacing combining marks removed.

    The string is decomposed with NFD so that accented letters split into
    a base character followed by combining marks; characters whose Unicode
    category is ``Mn`` are then dropped. The result is left in decomposed
    form (it is not recomposed afterwards).
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
+
def alnum_or_space(c):
    """Return *c* unchanged when it is alphanumeric, otherwise a space."""
    return c if c.isalnum() else ' '
+
def cleanup_text(s, lang):
    """Apply typographic clean-up to *s* according to the rules of *lang*.

    Steps common to every language:
      * straight apostrophes become typographic apostrophes;
      * ``...`` and ``. . .`` become a single ellipsis character;
      * straight double quotes are converted by ``cleanup_double_quotes``;
      * paragraphs are normalised by ``cleanup_paragraphs``.

    For ``lang == 'fr'``, curly quotes become guillemets and a
    non-breaking space is enforced before ``:``, ``?``, ``!`` and ``»``
    and after ``«``. For ``lang == 'en'``, guillemets (with an optional
    inner space) become curly quotes. Any other language only receives
    the common steps.

    Args:
        s: The text to clean up.
        lang: Language code; ``'fr'`` and ``'en'`` have dedicated rules.

    Returns:
        The cleaned-up text.
    """
    # Spelled as an escape so the character is visible in the source and
    # cannot be silently turned into a plain space by an editor or a copy.
    # NOTE(review): assumes the non-breaking space meant here is U+00A0;
    # confirm that the narrow no-break space U+202F was not intended.
    nbsp = '\u00a0'

    # Literal replacements need no regular expression.
    s = s.replace("'", '’')
    s = s.replace('...', '…')
    s = s.replace('. . .', '…')
    s = cleanup_double_quotes(s, lang)
    s = cleanup_paragraphs(s)

    if lang == 'fr':
        s = s.replace('“', '«').replace('”', '»')

        # Turn an ordinary space next to these marks into a non-breaking
        # one. The replacement is built by concatenation because the `re`
        # replacement template does not understand `\u` escapes.
        s = re.sub(r' ([:?!])', nbsp + r'\1', s)
        s = s.replace('« ', '«' + nbsp)
        s = s.replace(' »', nbsp + '»')

        # Insert the non-breaking space where there was no space at all.
        # The marks are handled one after another, in this order, so that
        # sequences such as "?!" are spaced mark by mark.
        for mark in ':?!»':
            s = re.sub(f'([^{nbsp}]){re.escape(mark)}', rf'\1{nbsp}{mark}', s)
        s = re.sub(f'«([^{nbsp}])', rf'«{nbsp}\1', s)

    elif lang == 'en':
        # Drop the inner space of French-style quotes, whether it is an
        # ordinary or a non-breaking space, then switch to curly quotes.
        s = re.sub(f'«[ {nbsp}]?', '“', s)
        s = re.sub(f'[ {nbsp}]?»', '”', s)

    return s
+
def cleanup_double_quotes(s, lang):
    """Replace straight double quotes with typographic quote pairs.

    Quotes are paired in order of appearance: the first ``"`` opens, the
    next one closes, and so on. An unmatched final quote is therefore
    rendered as an opening quote.

    Args:
        s: The text to process.
        lang: ``'fr'`` selects guillemets (``«`` / ``»``) and ``'en'``
            selects curly quotes (``“`` / ``”``).

    Returns:
        The text with its straight double quotes replaced. For any other
        language the text is returned unchanged, rather than having its
        quotes removed.
    """
    pairs = {
        'fr': ('«', '»'),
        'en': ('“', '”'),
    }
    if lang not in pairs:
        # No typographic convention known for this language: keep the
        # straight quotes instead of dropping them from the text.
        return s

    opening, closing = pairs[lang]
    pieces = []
    quoted = False
    for c in s:
        if c == '"':
            pieces.append(closing if quoted else opening)
            quoted = not quoted
        else:
            pieces.append(c)
    return ''.join(pieces)
+
def cleanup_paragraphs(s):
    """Normalise paragraph breaks and indentation in *s*.

    The text is split on runs of newlines. Each piece is stripped of
    surrounding whitespace, pieces that end up empty are discarded, and
    every remaining paragraph is prefixed with a single space. Paragraphs
    are then joined with one blank line between them.
    """
    paragraphs = []
    for chunk in re.split(r'\n+', s):
        text = chunk.strip()
        if text:
            paragraphs.append(' ' + text)
    return '\n\n'.join(paragraphs)