mirror of
https://github.com/marty-oehme/scripts.git
synced 2025-12-10 22:12:45 +00:00
New version
This commit is contained in:
parent
03d7a8ca37
commit
3eb80115a4
6 changed files with 292 additions and 70 deletions
69
inbox/md_convert.py
Normal file
69
inbox/md_convert.py
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
import re
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import urllib.parse
|
||||
import distutils.version
|
||||
|
||||
|
||||
def _pandoc_version_key(version):
    """
    Turn a dotted version string into a tuple of ints that compares naturally
    :param version: version string such as '1.19.1'; anything after the leading dotted number is ignored
    :return: tuple of ints, e.g. (1, 19, 1); an empty tuple when the string does not start with a number
    """
    leading_number = re.match(r'\s*(\d+(?:\.\d+)*)', str(version))
    if leading_number is None:
        return ()
    return tuple(int(part) for part in leading_number.group(1).split('.'))


def html_text(html_text, pandoc_bin='pandoc', pandoc_ver='1.19.1'):
    """
    This will convert html_text to markdown by running pandoc_bin and return markdown text
    :param html_text: html text to convert
    :param pandoc_bin: command/path to run pandoc
    :param pandoc_ver: pandoc version as string to use appropriate set of options
    :return: converted markdown text, or None when pandoc could not be run or its output could not be decoded
    """
    # Compared as int tuples so that this does not depend on the removed distutils module.
    # An unparsable version gives () and therefore falls into the oldest option set.
    version = _pandoc_version_key(pandoc_ver)

    pandoc_args = [pandoc_bin, '-f', 'html', '-t', 'markdown_strict+pipe_tables-raw_html']
    if version < (1, 16):
        pandoc_args.append('--no-wrap')
    else:
        # --no-wrap was replaced by --wrap=none in pandoc 1.16
        pandoc_args.append('--wrap=none')
        if version >= (2, 11, 2):
            # --atx-headers was deprecated in pandoc 2.11.2 and later removed
            pandoc_args.append('--markdown-headings=atx')
        elif version >= (1, 19):
            pandoc_args.append('--atx-headers')

    # Remove firefox reader mode panel if there's one
    html_text = re.sub(r'<ul id="reader-toolbar" class="toolbar">.*</li></ul></ul>', '', html_text, flags=re.DOTALL)

    try:
        # The exit status is deliberately not checked: whatever pandoc printed is returned as is
        completed = subprocess.run(pandoc_args, input=html_text.encode('utf-8'), stdout=subprocess.PIPE)
        return completed.stdout.decode('utf-8')
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: pandoc_bin is missing or not executable; ValueError covers encode/decode errors.
        # Conversion is best effort, so failure is reported to the caller as None.
        return None
|
||||
|
||||
def saved_html(html_path, folder_dir_path, pandoc_bin='pandoc', pandoc_ver='1.19.1'):
    """
    This will convert a saved html file to markdown by running pandoc_bin and return markdown text
    It will also move all in-line images to media directory at folder_dir_path and correct the links accordingly
    Only relative image links are touched; remote, data: and absolute links are left as they are
    The '_files' directory next to the html file is deleted afterwards
    :param html_path: full absolute path to saved html file to convert, with '_files' dir at the same directory
    :param folder_dir_path: full absolute path to directory where 'media' directory is
    :param pandoc_bin: command/path to run installed pandoc
    :param pandoc_ver: pandoc version to use appropriate set of options
    :return: markdown text with rewritten image links, or None when the conversion produced nothing
    """
    # html_text() sends the text to pandoc as UTF-8, so read it the same way instead of relying on the
    # platform default; undecodable bytes are replaced rather than aborting the conversion.
    with open(html_path, 'r', encoding='utf-8', errors='replace') as html:
        md_text = html_text(html.read(), pandoc_bin, pandoc_ver)

    if not md_text:
        return None

    html_dir = os.path.dirname(html_path)
    # original link -> (current file path, new file path); a dict so a repeated image is moved once
    pending_moves = {}

    def relink(match):
        link = match.group(2)
        local_path = urllib.parse.unquote(link)
        location = urllib.parse.urlsplit(link)
        if location.scheme or location.netloc or os.path.isabs(local_path):
            # Not a file saved next to the html: keep the link exactly as pandoc wrote it
            return match.group(0)

        # Drop the empty head of a bare file name so the new link has no double slash
        parts = [part for part in os.path.split(local_path) if part]
        if not parts:
            return match.group(0)

        pending_moves[link] = (os.path.join(html_dir, *parts),
                               os.path.join(folder_dir_path, 'media', *parts))
        return match.group(1) + 'file://media/' + '/'.join(parts)

    # Rewriting inside re.sub touches every image occurrence exactly once; a plain str.replace per
    # found link would rewrite an already rewritten link again when an image is used more than once.
    # TODO What if folder name has brackets
    md_text = re.sub(r'(!\[[^]]*] *\()([^)]*)', relink, md_text)

    for file_path, new_file_path in pending_moves.values():
        try:
            os.makedirs(os.path.dirname(new_file_path), exist_ok=True)
            # shutil.move falls back to copy + delete when media is on another filesystem
            shutil.move(file_path, new_file_path)
        except OSError:
            # Best effort: a missing or unmovable image must not abort the conversion
            pass

    shutil.rmtree(os.path.splitext(html_path)[0] + '_files', True)

    return md_text
|
||||
Loading…
Add table
Add a link
Reference in a new issue