1
0
Fork 0
mirror of https://github.com/marty-oehme/scripts.git synced 2025-12-10 22:12:45 +00:00

New version

This commit is contained in:
Maboroshy 2017-09-03 20:29:59 +04:00 committed by GitHub
parent 03d7a8ca37
commit 3eb80115a4
6 changed files with 292 additions and 70 deletions

View file

@ -2,19 +2,20 @@
import os
import re
import sys
import time
import argparse
import platform
import collections
import multiprocessing.dummy
import md_link
import md_convert
import safe_path
def text_to_md(file_attrs):
"""
This will process specified text file getting its tags and replacing urls with favicons and titles where possible
This will process specified text file getting its topics and replacing urls with favicons and titles where possible
:param file_attrs: File_attrs named tuple
:return: list of Note_attrs named tuple
"""
@ -24,8 +25,8 @@ def text_to_md(file_attrs):
with open(file_attrs.file_path, 'r') as text_file:
text = text_file.read()
tags = re.findall(file_attrs.tag_marker + '(\w*)', text)
text = re.sub(file_attrs.tag_marker + '\w*[ ]?', '', text).strip()
topics = re.findall(file_attrs.topic_marker + '(\w*)', text)
text = re.sub(file_attrs.topic_marker + '\w*[ ]?', '', text).strip()
if re.match('^http[s]?://[^\s]*$', text):
is_bookmark = True
@ -38,18 +39,24 @@ def text_to_md(file_attrs):
if is_bookmark:
bookmark_title = url.title
if file_attrs.inbox_file:
output_files = [file_attrs.inbox_file]
if file_attrs.output_file and file_attrs.output_file != '*no mtime*':
output_files = [file_attrs.output_file]
headline_title = ''
elif tags:
output_files = [tag + '.md' for tag in tags]
elif topics:
output_files = [topic + '.md' for topic in topics]
headline_title = ''
elif is_bookmark:
output_files = [time.strftime('%m-%d %H:%M', mtime) + ' ' + bookmark_title + '.md']
headline_title = '# {}\n'.format(bookmark_title)
if file_attrs.output_file == '*no mtime*':
output_files = [bookmark_title + '.md']
else:
output_files = [time.strftime('%m-%d %H:%M', mtime) + ' ' + bookmark_title + '.md']
else:
output_files = [time.strftime('%m-%d %H:%M', mtime) + ' ' + filename + '.md']
headline_title = '# {}\n'.format(filename)
if file_attrs.output_file == '*no mtime*':
output_files = [filename + '.md']
else:
output_files = [time.strftime('%m-%d %H:%M', mtime) + ' ' + filename + '.md']
output = []
for output_file in output_files:
@ -60,6 +67,24 @@ def text_to_md(file_attrs):
title=headline_title))
return output
def html_to_md(file_attrs, pandoc_bin='pandoc', pandoc_ver=''):
    """
    This will convert the specified html file to markdown text and build the note data for it.
    The conversion is delegated to md_convert.saved_html, which is given the file path and the folder directory path.
    :param file_attrs: File_attrs named tuple
    :param pandoc_bin: command/path of pandoc, passed on to md_convert.saved_html
    :param pandoc_ver: pandoc version, passed on to md_convert.saved_html
    :return: Note_attrs named tuple, or None if the conversion produced no text
    """
    source_path = file_attrs.file_path
    base_name = os.path.splitext(os.path.basename(source_path))[0]

    # The modification time is read before the conversion call is made
    modified = time.localtime(os.path.getmtime(source_path))

    converted = md_convert.saved_html(source_path, file_attrs.folder_dir_path,
                                      pandoc_bin=pandoc_bin, pandoc_ver=pandoc_ver)
    if not converted:
        return None

    # The note keeps the html file's base name, passed through safe_path.filename
    note_path = file_attrs.output_dir_path + os.sep + safe_path.filename(base_name + '.md')
    stamp = time.strftime('%x %a %X', modified)

    return Note_attrs(input_file_path=source_path,
                      output_file_path=note_path,
                      text=converted,
                      mtime='**{}** \n'.format(stamp),
                      title='')
def file_to_md(file_attrs, media_dir_name):
"""
@ -79,8 +104,10 @@ def file_to_md(file_attrs, media_dir_name):
file = md_link.File(new_path, file_attrs.folder_dir_path, os.path.splitext(os.path.basename(file_attrs.file_path))[0])
if file_attrs.inbox_file:
output_file = file_attrs.inbox_file
if file_attrs.output_file == '*no mtime*':
output_file = file.title + '.md'
elif file_attrs.output_file:
output_file = file_attrs.output_file
else:
output_file = time.strftime('%m-%d %H:%M', mtime) + ' ' + file.title + '.md'
@ -93,15 +120,16 @@ def file_to_md(file_attrs, media_dir_name):
if __name__ == '__main__':
File_attrs = collections.namedtuple('File_attrs', 'file_path folder_dir_path output_dir_path tag_marker inbox_file')
File_attrs = collections.namedtuple('File_attrs', 'file_path folder_dir_path output_dir_path topic_marker output_file')
"""
A named tuple which functions use to pass input data - data of files to be processed
:param file_path: full absolute path to the file to process
:param folder_dir_path: full absolute path to directory where 'media' and 'attachment' directories are
:param output_dir_path: full absolute path to directory where resulting text file will be stored
:param tag_marker: symbol(s) which start the tag word (for text files)
:param inbox_file: full absolute path to the text file which will be appended with a new entry,
if none the entry will go to new standalone text file
:param topic_marker: symbol(s) which start the 'topic' word (for text files)
:param output_file: empty for new standalone text file with mtime in the name,
'*no mtime*' for a new standalone text file without mtime in the name
or full absolute path to the text file which will be appended with a new entry
"""
Note_attrs = collections.namedtuple('Note_attrs', 'input_file_path output_file_path text mtime title')
@ -118,17 +146,35 @@ if __name__ == '__main__':
:param file_attrs: File_attrs named tuple
:return: Note_attrs named tuple
"""
if file_attrs.file_path.endswith('.txt') or not os.path.splitext(file_attrs.file_path)[1]:
return text_to_md(file_attrs)
elif args.pandoc_bin and args.pandoc_ver and file_attrs.file_path.endswith(('.htm', '.html')):
return html_to_md(file_attrs, args.pandoc_bin, args.pandoc_ver)
elif file_attrs.file_path.endswith(('.jpg', '.png', '.gif')):
return file_to_md(file_attrs, 'media')
else:
return file_to_md(file_attrs, 'attachments')
inbox_dir = sys.argv[1]
folder_dir = sys.argv[2]
tag_marker = sys.argv[3]
arg_parser = argparse.ArgumentParser(description='A script to turn everything in the inbox directory to markdown notes.')
arg_parser.add_argument('-i', '--inbox', action='store', dest='inbox_dir', required=True,
help="Full absolute path to the inbox directory to organize")
arg_parser.add_argument('-f', '--folder', action='store', dest='folder_dir', required=True,
help="Full absolute path to directory where 'media' and 'attachment' directories are")
arg_parser.add_argument('-m', '--marker', action='store', dest='topic_marker', required=False, default='@',
help="Symbol(s) which start the 'topic' word (for text files)")
arg_parser.add_argument('-s', '--scan-folder', action='store_true', dest='scan_folder', required=False,
help="Process whole folder rather than only inbox")
arg_parser.add_argument('-p', '--pandoc-bin', action='store', dest='pandoc_bin', required=False,
help="Command/path to run pandoc")
arg_parser.add_argument('-pv', '--pandoc-ver', action='store', dest='pandoc_ver', required=False,
help="Installed pandoc version")
args = arg_parser.parse_args()
inbox_dir = args.inbox_dir
folder_dir = args.folder_dir
topic_marker = args.topic_marker
os.makedirs(inbox_dir, exist_ok=True)
os.makedirs(folder_dir + os.sep + 'media', exist_ok=True)
@ -136,27 +182,35 @@ if __name__ == '__main__':
# Prepare a list of File_attrs tuples for process_by_ext function, based on file location, older files first
file_list = []
if args.scan_folder:
for file_path in sorted([folder_dir + os.sep + path for path in os.listdir(folder_dir)], key=os.path.getmtime):
if os.path.isfile(file_path) and not file_path.endswith(('.md', 'notes.sqlite')) \
and not os.path.basename(file_path).startswith(('.', inbox_dir, folder_dir + os.sep + 'media', folder_dir + os.sep + 'attachments')):
file_list.append([File_attrs(file_path=file_path, folder_dir_path=folder_dir, output_dir_path=os.path.dirname(file_path),
topic_marker=topic_marker, output_file='*no mtime*')])
for file_path in sorted([inbox_dir + os.sep + path for path in os.listdir(inbox_dir)], key=os.path.getmtime):
if os.path.isdir(file_path) and not os.path.basename(file_path).startswith('.'):
if os.path.isdir(file_path) and not os.path.basename(file_path).startswith('.') and not file_path.endswith('_files'):
for sub_file in sorted([file_path + os.sep + path for path in os.listdir(file_path)], key=os.path.getmtime):
if not sub_file.endswith('.md') and not os.path.basename(sub_file).startswith('.'):
file_list.append([File_attrs(file_path=sub_file, folder_dir_path=folder_dir, output_dir_path=inbox_dir,
tag_marker=tag_marker, inbox_file=os.path.basename(file_path) + '.md')])
topic_marker=topic_marker, output_file=os.path.basename(file_path) + '.md')])
else:
if not file_path.endswith('.md') and not os.path.basename(file_path).startswith('.'):
if os.path.isfile(file_path) and not file_path.endswith('.md') and not os.path.basename(file_path).startswith('.'):
file_list.append([File_attrs(file_path=file_path, folder_dir_path=folder_dir, output_dir_path=inbox_dir,
tag_marker=tag_marker, inbox_file='')])
topic_marker=topic_marker, output_file='')])
# Run process_by_ext for each File_attrs tuple putting resulted Note_attrs tuples to write_list
write_list = multiprocessing.dummy.Pool().starmap(process_by_ext, file_list)
# Run process_by_ext for each File_attrs tuple putting resulting Note_attrs tuples to write_list
write_list = multiprocessing.dummy.Pool(100).starmap(process_by_ext, file_list)
# Because text_to_md returns a list of Note_attrs tuples, write_list is flattened into a single flat list here
flat_write_list = []
for object in write_list:
if type(object) == list:
for item in object:
flat_write_list.append(item)
else:
if type(item) == Note_attrs:
flat_write_list.append(item)
elif type(object) == Note_attrs:
flat_write_list.append(object)
# Create or append existing text files based on Note_attrs tuples data