From 9976122f09f920808227c5fbcde6b81e498aad08 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 4 Jun 2020 23:15:04 +0100 Subject: [PATCH 1/7] core: extract pandoc converter, polar: fix unconverted comment headings --- modules/polar.py | 58 +++++++++++---------------------------------- scripts/ci/run | 3 ++- src/orger/pandoc.py | 48 +++++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 45 deletions(-) create mode 100644 src/orger/pandoc.py diff --git a/modules/polar.py b/modules/polar.py index 64638aa..086824b 100755 --- a/modules/polar.py +++ b/modules/polar.py @@ -15,13 +15,23 @@ """ -from orger import StaticView -from orger.inorganic import node, link +from orger import Mirror +from orger.inorganic import node, link, OrgNode from orger.common import dt_heading +from orger import pandoc -class PolarView(StaticView): + +class PolarView(Mirror): def get_items(self): from my.reading import polar + + def make_comment(c: polar.Comment) -> OrgNode: + text = pandoc.to_org(data=c.text, from_='html', logger=self.logger) + return node( + heading=dt_heading(c.created, text.splitlines()[0]), + body=text, + ) + def make_item(res: polar.Result): if isinstance(res, polar.Error): # TODO could create error heading from exception automatically? take first line as heading and rest + traceback as the body @@ -39,10 +49,7 @@ def make_item(res: polar.Result): heading=dt_heading(hl.created, hl.selection), tags=hl.tags, properties=None if hl.color is None else {'POLAR_COLOR': hex2name(hl.color)}, - children=[node( - heading=dt_heading(c.created, c.text.splitlines()[0]), - body=html2org(c.text, logger=self.logger), - ) for c in hl.comments] + children=[make_comment(c) for c in hl.comments], ) for hl in book.items] ) for res in polar.get_entries(): @@ -61,43 +68,6 @@ def hex2name(hexc: str) -> str: ) -# TODO move to base? -def html2org(html: str, logger) -> str: - # meh. for some reason they are converted to \\ otherwise - html = html.replace('
', '') - - - from subprocess import run, PIPE - try: - r = run( - ['pandoc', '-f', 'html', '-t', 'org', '--wrap=none'], - check=True, - input=html.encode('utf8'), - stdout=PIPE, - ) - except FileNotFoundError as fe: - import warnings - warnings.warn("Please install 'pandoc' to convert HTML to org-mode. See https://pandoc.org/installing.html") - except Exception as e: - logger.exception(e) - else: - return r.stdout.decode('utf8') - return html # fallback - - -# TODO decode text incoming from polar? - -def test_html2org(): - import logging - # html = "

and a comment too 


multiline!

" - # TODO ok, it's annoying... not sure what to do with nonpritable crap - html = "

and a comment too


multiline!

" - assert html2org(html, logger=logging) == r''' -and a /comment/ too - -*multiline*! -'''.lstrip() - if __name__ == '__main__': PolarView.main() diff --git a/scripts/ci/run b/scripts/ci/run index 82d691f..0d4e0e0 100755 --- a/scripts/ci/run +++ b/scripts/ci/run @@ -21,7 +21,8 @@ if ! [ -z "$CI" ]; then fi # vim is used in one of the tests -command -v vim || sudo apt install vim +command -v vim || sudo apt install vim +command -v pandoc || sudo apt install pandoc pip3 install --user tox tox diff --git a/src/orger/pandoc.py b/src/orger/pandoc.py new file mode 100644 index 0000000..e5f07fd --- /dev/null +++ b/src/orger/pandoc.py @@ -0,0 +1,48 @@ +""" +Helper for converting stuff to pandoc +""" +import logging +import shutil +from subprocess import run, PIPE +from typing import Optional + + +has_pandoc = shutil.which('pandoc') is not None + +if not has_pandoc: + import warnings + warnings.warn("Please install 'pandoc' to convert HTML to org-mode. See https://pandoc.org/installing.html") + + +def to_org(*, data: str, from_: str, logger=logging) -> str: + if not has_pandoc: + return data + # TODO batch?? + + # meh. for some reason they are converted to \\ otherwise + if from_ == 'html': + data = data.replace('
', '') + + try: + r = run( + ['pandoc', '-f', from_, '-t', 'org', '--wrap=none'], + check=True, + input=data.encode('utf8'), + stdout=PIPE, + ) + except Exception as e: + logger.exception(e) + return data # fallback + res = r.stdout.decode('utf8') + return res + + +def test(): + # html = "

and a comment too 


multiline!

" + # TODO ok, it's annoying... not sure what to do with nonpritable crap + html = "

and a comment too


multiline!

" + assert to_org(data=html, from_='html') == r''' +and a /comment/ too + +*multiline*! +'''.lstrip() From 052a93019d38fad53abc2ec3b7ab864ca59b080f Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Thu, 4 Jun 2020 23:34:29 +0100 Subject: [PATCH 2/7] core: add option to disable pandoc, reuse it in roam module --- README.org | 4 ++++ modules/roamresearch.py | 20 ++++---------------- src/orger/common.py | 1 + src/orger/org_view.py | 14 ++++++++++++-- src/orger/pandoc.py | 16 ++++++++++------ 5 files changed, 31 insertions(+), 24 deletions(-) diff --git a/README.org b/README.org index f6dc088..9d88aac 100644 --- a/README.org +++ b/README.org @@ -21,6 +21,10 @@ I write in detail about usecases and motivation for it [[https://beepb00p.xyz/or - =pip3 install --user .= - after that you can use =python3 -m orger.modules.modulename=, same way as the previous section, or run =modules/modulename.py= directly +- [optional]: install [[https://pandoc.org/installing.html][pandoc]], it might give you better org-mode outputs for some modules + + If you do have pandoc installed, but don't want the module to use it, pass =--disable-pandoc= flag to it. + * Usage and examples I usually run Orger modules overnight via cron. diff --git a/modules/roamresearch.py b/modules/roamresearch.py index bc2d7e0..fe49364 100755 --- a/modules/roamresearch.py +++ b/modules/roamresearch.py @@ -2,26 +2,14 @@ from itertools import chain from typing import Iterable -from orger import StaticView +from orger import Mirror from orger.inorganic import node, link, OrgNode from orger.common import dt_heading +from orger import pandoc import my.roamresearch as roamresearch -from subprocess import run, PIPE - -def md2org(text: str) -> str: - # TODO use batch?? or talk to a process - r = run( - ['pandoc', '-f', 'markdown', '-t', 'org', '--wrap=none'], - check=True, - input=text.encode('utf8'), - stdout=PIPE, - ) - return r.stdout.decode('utf8') - - # todo ^^ ^^ things are highlight? 
def roam_text_to_org(text: str) -> str: """ @@ -31,7 +19,7 @@ def roam_text_to_org(text: str) -> str: ('{{[[slider]]}}', ''), ]: text = text.replace(f, t) - org = md2org(text) + org = pandoc.to_org(text, from_='markdown') org = org.replace(r'\_', '_') # unescape, it's a bit aggressive.. return org @@ -87,7 +75,7 @@ def roam_note_to_org(node: roamresearch.Node, top=False) -> Iterable[OrgNode]: ) -class RoamView(StaticView): +class RoamView(Mirror): def get_items(self): rr = roamresearch.roam() from concurrent.futures import ThreadPoolExecutor diff --git a/src/orger/common.py b/src/orger/common.py index 59004b5..f85f539 100644 --- a/src/orger/common.py +++ b/src/orger/common.py @@ -6,6 +6,7 @@ class settings: DEFAULT_TIMESTAMP_STYLE = TimestampStyle.INACTIVE + USE_PANDOC: bool = True def dt_heading(dt: Optional[datetime], heading: str) -> str: diff --git a/src/orger/org_view.py b/src/orger/org_view.py index e573cbd..27ef8eb 100644 --- a/src/orger/org_view.py +++ b/src/orger/org_view.py @@ -61,10 +61,20 @@ def main_common(self) -> None: settings.DEFAULT_TIMESTAMP_STYLE = _style_map[timestamp_style] setup_logger(self.logger, level=logging.DEBUG) + pandoc = self.args.pandoc + settings.USE_PANDOC = pandoc + @classmethod def parser(cls) -> ArgumentParser: - p = ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + F = lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, width=120) + p = argparse.ArgumentParser(formatter_class=F) # type: ignore + p.add_argument( + '--disable-pandoc', + action='store_false', + dest='pandoc', + help='Pass to disable pandoc conversions to org-mode (it might be slow in some cases)', + ) p.add_argument( '--timestamps', type=str, @@ -93,7 +103,7 @@ class Mirror(OrgView): @classmethod def main(cls, setup_parser=None) -> None: p = cls.parser() - p.add_argument('--to', type=Path, default=Path(cls.name() + '.org')) + p.add_argument('--to', type=Path, default=Path(cls.name() + '.org'), help='Filename to output') if 
setup_parser is not None: setup_parser(p) diff --git a/src/orger/pandoc.py b/src/orger/pandoc.py index e5f07fd..0257e62 100644 --- a/src/orger/pandoc.py +++ b/src/orger/pandoc.py @@ -7,15 +7,19 @@ from typing import Optional -has_pandoc = shutil.which('pandoc') is not None +from .common import settings -if not has_pandoc: - import warnings - warnings.warn("Please install 'pandoc' to convert HTML to org-mode. See https://pandoc.org/installing.html") +if settings.USE_PANDOC: + has_pandoc = shutil.which('pandoc') is not None - -def to_org(*, data: str, from_: str, logger=logging) -> str: if not has_pandoc: + import warnings + warnings.warn("Please install 'pandoc' to convert HTML to org-mode. See https://pandoc.org/installing.html") + settings.USE_PANDOC = False + + +def to_org(data: str, *, from_: str, logger=logging) -> str: + if not settings.USE_PANDOC: return data # TODO batch?? From 5b8046c14917be94bb68f9fb58a9d30b8367cd12 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 5 Jun 2020 00:01:06 +0100 Subject: [PATCH 3/7] github: use pandoc for markdown conversion & nicer bodies --- modules/github.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/modules/github.py b/modules/github.py index 7528916..2833400 100755 --- a/modules/github.py +++ b/modules/github.py @@ -2,8 +2,11 @@ from orger import Mirror from orger.inorganic import node, link from orger.common import dt_heading, error +from orger import pandoc import my.coding.github as gh +# todo use later: import my.github.ghexport as gh. also careful about using events() -- need to sort? +# I guess makes sense to generally expose get_ methods? class Github(Mirror): @@ -13,14 +16,22 @@ def get_items(self) -> Mirror.Results: yield error(e) continue # TODO filter only events that have body? e.g. 
not sure if much point emitting pull requests here + summary = e.summary + body = e.body + if body is None: + lines = summary.splitlines(keepends=True) + if len(lines) > 1: + summary = lines[0].strip() + body = ''.join(lines[1:]) # todo meh. hacky, better to extract bodies in the provider properly + if body.strip() == '': + body = None + yield node( dt_heading( e.dt, - link(url=e.link, title=e.summary) if e.link is not None else e.summary + link(url=e.link, title=summary) if e.link is not None else summary ), - # TODO would be nice to convert from markdown to org here - # TODO use pandoc thingie? make it configurable too - body=e.body, + body=None if body is None else pandoc.to_org(body, from_='gfm'), # github flavored markdown ) From d9802359176ab619c5c1117809b76844b9a0f33a Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 5 Jun 2020 18:46:10 +0100 Subject: [PATCH 4/7] queue: prompt for creating file when using interactive shell --- README.org | 7 ++----- src/orger/org_view.py | 19 +++++++++++++------ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/README.org b/README.org index 9d88aac..600359e 100644 --- a/README.org +++ b/README.org @@ -93,9 +93,9 @@ print(orger.Queue.__doc__) #+RESULTS: :results: - *Queue* (old name =InteractiveView=): works as a queue, *only previously unseen items* from the data source are appended to the output org-mode file. + *Queue* (old name =InteractiveView=): works as a queue, *only previously unseen items* from the data source are added to the output org-mode file. - To keep track of old/new items, it's using a separate JSON =state= file. + To keep track of previously seen items, it's using a separate JSON =state= file. A typical usecase is a todo list, or a content processing queue. You can use such a module as you use any other org-mode file: schedule/refile/comment/set priorities, etc. 
@@ -106,9 +106,6 @@ print(orger.Queue.__doc__) You can run such a module as: - : # initialize the state file first to avoid surprises (you only need to do it once) - : ./orger_module.py --to /path/to/output.org --state /path/to/state.json --init - : # after that you can just run it: : ./orger_module.py --to /path/to/output.org --state /path/to/state.json * FAQ diff --git a/src/orger/org_view.py b/src/orger/org_view.py index 27ef8eb..ead8993 100644 --- a/src/orger/org_view.py +++ b/src/orger/org_view.py @@ -173,9 +173,9 @@ def test(): class Queue(OrgView): """ - *Queue* (old name =InteractiveView=): works as a queue, *only previously unseen items* from the data source are appended to the output org-mode file. + *Queue* (old name =InteractiveView=): works as a queue, *only previously unseen items* from the data source are added to the output org-mode file. - To keep track of old/new items, it's using a separate JSON =state= file. + To keep track of previously seen items, it's using a separate JSON =state= file. A typical usecase is a todo list, or a content processing queue. You can use such a module as you use any other org-mode file: schedule/refile/comment/set priorities, etc. @@ -193,7 +193,14 @@ def _run( dry_run: bool=False, ) -> None: if not to.exists() and not init: - raise RuntimeError(f"target {to} doesn't exist! Try running with --init") + err = RuntimeError(f"{to} doesn't exist! Try running with --init") + import sys + if sys.stdin.isatty(): + resp = input(f"{to} doesn't exist. Create empty file? 
y/n ").strip().lower() + if resp != 'y': + raise err + else: + raise err state = JsonState( path=state_path, @@ -232,10 +239,10 @@ def get_items(self) -> Iterable[OrgWithKey]: @classmethod def main(cls, setup_parser=None) -> None: p = cls.parser() - p.add_argument('--to' , type=Path, default=Path(cls.name() + '.org') , help='file where new items are appended') + p.add_argument('--to' , type=Path, default=Path(cls.name() + '.org') , help='file where new items are added') p.add_argument('--state', type=Path, default=Path(cls.name() + '.state.json'), help='state file for keeping track of handled items') - p.add_argument('--init', action='store_true') - p.add_argument('--dry-run', action='store_true') + p.add_argument('--init', action='store_true') # todo not sure if I really need it? + p.add_argument('--dry-run', action='store_true', help='Run without modifying the state file') if setup_parser is not None: setup_parser(p) From 2223d01583cac26da1e3be171f212b2d51544206 Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 5 Jun 2020 18:59:45 +0100 Subject: [PATCH 5/7] queue: fallback to user config dir for state file --- README.org | 8 +++++++- setup.py | 5 ++++- src/orger/common.py | 6 ++++++ src/orger/org_view.py | 6 ++++-- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/README.org b/README.org index 600359e..086b8dc 100644 --- a/README.org +++ b/README.org @@ -106,7 +106,13 @@ print(orger.Queue.__doc__) You can run such a module as: - : ./orger_module.py --to /path/to/output.org --state /path/to/state.json + : ./orger_module.py --to /path/to/output.org + + This will keep the state file in your user config dir (e.g. =~/.config/orger/=). + + Alternatively, you can pass the state file explicitly: + + : ./orger_module.py --to /path/to/output.org --state /path/to/state.json * FAQ - Why are the files output by some modules read only? 
diff --git a/setup.py b/setup.py index 5cfde5d..91d8e72 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,10 @@ def main(): author_email='karlicoss@gmail.com', description='Converts data into org-mode', - install_requires=['atomicwrites'], + install_requires=[ + 'appdirs' , # to keep state files + 'atomicwrites', # to safely append data to a file + ], extras_require={ 'testing': ['pytest'], 'linting': [ diff --git a/src/orger/common.py b/src/orger/common.py index f85f539..e334c1b 100644 --- a/src/orger/common.py +++ b/src/orger/common.py @@ -1,5 +1,6 @@ from datetime import datetime from typing import Optional +from pathlib import Path from .inorganic import OrgNode, timestamp, timestamp_with_style, TimestampStyle @@ -45,3 +46,8 @@ def todo(dt: datetime, **kwargs): # todo use klogging2? from .klogging import LazyLogger, setup_logger + + +def orger_user_dir() -> Path: + import appdirs # type: ignore[import] + return Path(appdirs.user_config_dir('orger')) diff --git a/src/orger/org_view.py b/src/orger/org_view.py index ead8993..74017e9 100644 --- a/src/orger/org_view.py +++ b/src/orger/org_view.py @@ -11,7 +11,7 @@ from .inorganic import OrgNode, TimestampStyle from .state import JsonState from .atomic_append import PathIsh, atomic_append_check, assert_not_edited -from .common import setup_logger +from .common import setup_logger, orger_user_dir # TODO tests for determinism? not sure where should they be... # think of some generic thing to test that? @@ -202,6 +202,7 @@ def _run( else: raise err + state_path.parent.mkdir(parents=True, exist_ok=True) # not sure... 
state = JsonState( path=state_path, logger=self.logger, @@ -238,9 +239,10 @@ def get_items(self) -> Iterable[OrgWithKey]: @classmethod def main(cls, setup_parser=None) -> None: + default_state = orger_user_dir() / 'states' / (cls.name() + '.state.json') p = cls.parser() p.add_argument('--to' , type=Path, default=Path(cls.name() + '.org') , help='file where new items are added') - p.add_argument('--state', type=Path, default=Path(cls.name() + '.state.json'), help='state file for keeping track of handled items') + p.add_argument('--state', type=Path, default=default_state, help='state file for keeping track of handled items') p.add_argument('--init', action='store_true') # todo not sure if I really need it? p.add_argument('--dry-run', action='store_true', help='Run without modifying the state file') if setup_parser is not None: From 7eb1797cf032be8772eb15b2072d07d62e09dbdc Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 5 Jun 2020 19:13:25 +0100 Subject: [PATCH 6/7] youtube: mark videos that were deleted --- modules/youtube.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/modules/youtube.py b/modules/youtube.py index f8ae0d0..37c5f13 100755 --- a/modules/youtube.py +++ b/modules/youtube.py @@ -3,25 +3,26 @@ from orger.inorganic import node, link from orger.common import dt_heading -from my.media.youtube import get_watched +from my.media.youtube import watched from itertools import groupby class YoutubeView(Mirror): def get_items(self) -> Mirror.Results: - watched = get_watched() by_url = lambda w: w.url by_when = lambda w: w.when items = [ max(group, key=by_when) - for _, group in groupby(sorted(watched, key=by_url), key=by_url) + for _, group in groupby(sorted(watched(), key=by_url), key=by_url) ] items = sorted(items, key=by_when) # TODO for each url only take latest? for item in items: + deleted = item.url == item.title # todo move to HPI? 
+ l = link(title=item.title + (' (DELETED)' if deleted else ''), url=item.url) yield (item.url, node( - heading=dt_heading(item.when, link(title=item.title, url=item.url)), + heading=dt_heading(item.when, l), )) From 85b114973801f3cce92e4581010aeb16ab4d7e5a Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Fri, 5 Jun 2020 19:14:45 +0100 Subject: [PATCH 7/7] reddit: rename to reddit2org --- modules/{reddit.py => reddit2org.py} | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) rename modules/{reddit.py => reddit2org.py} (94%) diff --git a/modules/reddit.py b/modules/reddit2org.py similarity index 94% rename from modules/reddit.py rename to modules/reddit2org.py index 834b3bd..0ac6edb 100755 --- a/modules/reddit.py +++ b/modules/reddit2org.py @@ -2,13 +2,14 @@ """ Better interface for reading saved reddit posts/comments """ -from orger import InteractiveView +from orger import Mirror from orger.inorganic import node, link from orger.common import dt_heading from my.reddit import saved -class RedditView(InteractiveView): + +class RedditView(Mirror): def get_items(self): for s in saved(): yield s.sid, node( @@ -21,6 +22,7 @@ def get_items(self): body=s.text, ) + # todo this could be generic, i.e. checking all urls? def is_dead_url(self, url: str) -> bool: assert self.cmdline_args is not None if not self.cmdline_args.mark_dead: