fill-template

#!/usr/bin/env python3
"""
Fill a LaTeX document templated with Jinja2 and render to PDF for every row in
an input table.

Template parameters are read from the CSV/TSV file specified with --params and
are interpolated via \\VAR{column_name} constructs in LaTeX.  Other templating
is possible; refer to <https://pythonhosted.org/latex/>.

The rendered PDFs are saved to file paths specified by --output, which should
be a Python f-string making use of the template parameters to construct a
unique filename for every row of input data.

For example, given a TSV that looks like this:

\b
    barcode	birth_date	pat_name
    AAAAAAAA	1966-05-09	Kermit the Frog
    BBBBBBBB	1969-07-21	Bert

running:

\b
    fill-template \\
        --template scan/report-en.tex \\
        --params sesame-results.tsv \\
        --output '{barcode}-{birth_date}-en.pdf'

will produce two files:

\b
    AAAAAAAA-1966-05-09-en.pdf
    BBBBBBBB-1969-07-21-en.pdf

"""
import click
import fsspec
import logging
import os
import pandas
from datetime import date
from itertools import cycle
from jinja2 import FileSystemLoader
from multiprocessing import Pool
from latex.build import PdfLatexBuilder
from latex.jinja2 import make_env
from os.path import basename, dirname
from sys import stdin, stderr


# Level for this script's own logger, read from the LOG_LEVEL environment
# variable (in any letter case) and defaulting to DEBUG.
LOG_LEVEL = os.environ.get("LOG_LEVEL", "debug").upper()

# This module's logger carries its own level, separate from the root logger
# configured below.
log = logging.getLogger(__name__)
log.setLevel(LOG_LEVEL)

# The root logger writes timestamped records to stderr and is set to ERROR.
logging.basicConfig(
    stream = stderr,
    level = logging.ERROR,
    datefmt = "%Y-%m-%d %H:%M:%S%z",
    format = "[%(asctime)s] %(levelname)-8s %(message)s")

# Route messages issued via the warnings module into the logging system.
logging.captureWarnings(True)


# XeLaTeX has the best Unicode support of the various LaTeX engines, which is
# important since we'll be preparing reports in 12 languages.
xelatex = PdfLatexBuilder("xelatex")

# Fail fast at import time if the engine is missing.  This is an explicit
# raise rather than an `assert` statement because asserts are stripped when
# Python runs with -O, which would silently skip the check.  The exception
# type and message match what the assert produced.
if not xelatex.is_available():
    raise AssertionError("xelatex is not available")


@click.command("fill-template", help = __doc__, no_args_is_help = True)

@click.option("--template", "template_path",
    metavar = "<report.tex>",
    help = "Path to a LaTeX document templated with Jinja2; "
           "syntax described at <https://pythonhosted.org/latex/>",
    type = click.Path(exists = True, dir_okay = False, resolve_path = True),
    required = True)

@click.option("--params", "params_path",
    metavar = "<results.csv>",
    help = "Path to a table (CSV or TSV) of template parameter values; "
           "each row produces one output file. "
           "Default is stdin.",
    default = "-")

@click.option("--output", "output_path",
    metavar = "<path.pdf>",
    help = "Path to which PDFs are written; should use Python f-string "
           "syntax to produce a unique path for each input row.",
    required = True)

@click.option("--filter", "filter_query",
    metavar = "<condition>",
    help = "A Pandas DataFrame query string for filtering the parameter table")

@click.option("--workers", "worker_count",
    metavar = "<n>",
    help = "Number of parallel worker processes to start for PDF rendering",
    show_default = True,
    type = int,
    # Number of CPUs available to this process; see doc for os.cpu_count().
    # os.sched_getaffinity() is not available on every platform, so fall back
    # to the total CPU count (or 1 if even that is unknown) where it is
    # missing instead of failing at import time.
    default = (
        len(os.sched_getaffinity(0))
            if hasattr(os, "sched_getaffinity")
            else (os.cpu_count() or 1)))

@click.pass_context

def __main__(ctx, *, template_path, params_path, output_path, filter_query, worker_count):
    """
    Render one PDF per parameter row using a pool of worker processes.

    The parameter table is read from ``params_path`` (``-`` means stdin),
    optionally filtered with ``filter_query``, and every remaining row is
    passed to ``render()`` together with ``template_path`` and the
    ``output_path`` pattern.  If any row fails to render, the failed output
    names are logged and the command exits with status 1.

    This docstring is not shown as CLI help; the command's help text is set
    explicitly from the module docstring.
    """
    # The parameter table is opened by path below, so map the conventional
    # "-" onto the stdin device file.
    if params_path == "-":
        params_path = "/dev/stdin"

    with Pool(worker_count) as workers:
        log.debug(f"Started pool of n={worker_count:,} workers")

        with fsspec.open(params_path) as file:
            param_table = read_params(file, filter_query)

        # Each task is (template_path, output_path, param_row).  The two
        # cycled single-item lists repeat the constant arguments for as many
        # rows as param_table has; zip() stops when the rows run out.
        results = workers.starmap(
            render, zip(
                cycle([str(template_path)]),
                cycle([output_path]),
                param_table ))

        # render() returns (output_name, successful); keep the failures.
        errors = [
            output_file
                for output_file, successful in results
                 if not successful ]

    if errors:
        log.error(f"Errors were encountered (n={len(errors)}) during processing of: {errors!r}")
        ctx.exit(1)


def render(template_path, output_path, param_row):
    """
    Render a single parameter row to a PDF file and report the outcome.

    ``output_path`` is a format string filled in with the values of
    ``param_row`` to produce the output file name.  The template at
    ``template_path`` is then rendered with ``param_row`` into that file.

    Returns a tuple ``(output_name, successful)``.  Rendering errors are
    logged with their traceback and reported as ``successful = False`` rather
    than raised.  Errors while formatting the output name or opening the
    output file are not caught here and propagate to the caller.
    """
    output_name = output_path.format(**param_row)

    # NOTE(review): the output is opened before rendering starts, so a failed
    # render presumably leaves an empty or partial file behind — confirm
    # whether that matters for the storage backends in use.
    with fsspec.open(output_name, "wb") as output_file:
        try:
            log.info(f"Generating {output_name}")
            pdf_renderer(template_path)(param_row, output_file)
        except Exception:
            # Deliberately not a bare `except:` so that KeyboardInterrupt and
            # SystemExit still stop the worker instead of being recorded as
            # an ordinary rendering failure.
            log.exception(f"Error generating {output_name}")
            return output_name, False
        else:
            return output_name, True


def pdf_renderer(template_path):
    """
    Build a function that renders the template at ``template_path`` to PDF.

    The template is loaded once, from the directory containing
    ``template_path``.  The returned function takes ``(params, output_file)``,
    renders the template with ``params``, builds a PDF from the result with
    the module-level ``xelatex`` builder, and saves it to ``output_file``.
    """
    template_dir = dirname(template_path)
    template_name = basename(template_path)

    env = make_env(loader = FileSystemLoader(template_dir))
    template = env.get_template(template_name)

    # Let the template reference files relative to itself.  The trailing
    # empty string keeps the default search locations, as described in
    # `man tex`.
    tex_search_path = [template_dir, ""]

    def render_pdf(params, output_file):
        # Convenience global for reports: today's date, unless the caller
        # already supplied a "current_date" value.  This mutates the dict
        # that was passed in.
        params.setdefault("current_date", str(date.today()))

        latex_source = template.render(params)
        pdf = xelatex.build_pdf(latex_source, texinputs = tex_search_path)

        return pdf.save_to(output_file)

    return render_pdf


def read_params(file, filter_query):
    """
    Load the template parameter table and return it as a list of row dicts.

    ``file`` is read with pandas using the Python parsing engine and
    ``sep = None``, so the delimiter is detected from the data.  All columns
    are read with the pandas "string" dtype.  If ``filter_query`` is truthy it
    is applied as a DataFrame query and only matching rows are kept.

    Returns one dict per remaining row, keyed by column name.
    """
    table = pandas.read_csv(file, sep = None, dtype = "string", engine = "python")
    log.info(f"Loaded {len(table):,} parameter rows")

    if filter_query:
        kept = table.query(filter_query)
        log.info(f"Filtered out {len(table) - len(kept):,} parameter rows where {filter_query}")
        table = kept

    return table.to_dict(orient = "records")


if __name__ == "__main__":
    __main__()