Skip to content
This repository has been archived by the owner on Apr 25, 2023. It is now read-only.

WORK IN PROGRESS: Specification for using the Legs scraping framework

Ori Hoch edited this page Sep 1, 2014 · 3 revisions

This document details the requirements for using the Legs framework to meet the scraping needs of Open Knesset (and other similar projects).

User stories

Moving the lobbyists scraping processes from okscraper to Legs

The main Legs JSON file for lobbyists will look like this:

[
    {
        "action" : "FETCH/url",
        "values" : {
            "url" : "http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx"
        },
        "yields" : "lobbyistIndexHtml"
    },
    {
        "action" : "EXTRACT_HTML_XPATH/lobbyistIndexHtml/selector",
        "values" : {
            "selector" : "//*[@lobbyist_id]/@lobbyist_id"
        },
        "yields" : "lobbyist_ids"
    },
    {
        "action" : "MAP_PAR/lobbyist_ids/toParam/extractData",
        "yields" : "lobbyistJsons",
        "values" : {
            "toParam":"lobbyist_id",
            "extractData" : [
                {
                    "action" : "REPLACE_REGEX/lobbyist_id/match/replace",
                    "values" : {
                        "match" : "(.*)",
                        "replace" : "http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist($1)"
                    },
                    "yields" : "lobbyistUrl"
                },
                {
                    "action" : "FETCH/lobbyistUrl",
                    "yields" : "lobbyistHtml"
                },
                {
                    "action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
                    "values" : {
                        "selector" : "//d:lobbyist_id/text()"
                    },
                    "yields" : "id"
                },
                {
                    "action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
                    "values" : {
                        "selector" : "//d:first_name/text()"
                    },
                    "yields" : "first_name"
                },
                {
                    "action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
                    "values" : {
                        "selector" : "//d:family_name/text()"
                    },
                    "yields" : "family_name"
                },
                {
                    "action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
                    "values" : {
                        "selector" : "//d:profession/text()"
                    },
                    "yields" : "profession"
                },
                {
                    "action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
                    "values" : {
                        "selector" : "//d:faction_name/text()"
                    },
                    "yields" : "faction_name"
                },
                {
                    "action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
                    "values" : {
                        "selector" : "//d:lobyst_permit_type/text()"
                    },
                    "yields" : "lobyst_permit_type"
                },
                {
                    "action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
                    "values" : {
                        "selector" : "//d:corporation_name/text()"
                    },
                    "yields" : "corporation_name"
                },
                {
                    "action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
                    "values" : {
                        "selector" : "//d:corporation_id/text()"
                    },
                    "yields" : "corporation_id"
                },
                {
                    "action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
                    "values" : {
                        "selector" : "//d:faction_member/text()"
                    },
                    "yields" : "faction_member"
                },
                {
                    "action" : "REPLACE_REGEX/lobbyist_id/match/replace",
                    "values" : {
                        "match" : "(.*)",
                        "replace" : "http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist($1)/lobbist_type"
                    },
                    "yields" : "lobbyistRepresentsUrl"
                },
                {
                    "action" : "FETCH/lobbyistRepresentsUrl",
                    "yields" : "lobbyistRepresentsHtml"
                },
                {
                    "action" : "EXTRACT_HTML_XPATH/lobbyistRepresentsHtml/selector",
                    "values" : {
                        "selector" : "//content"
                    },
                    "yields" : "lobbyistRepresentsContents"
                },
                {
                    "action" : "MAP_PAR/lobbyistRepresentsContents/toParam/steps",
                    "yields" : "lobbyistRepresentJsons",
                    "values" : {
                        "toParam" : "lobbyistRepresentsContent",
                        "steps" : [
                            {
                                "action" : "EXTRACT_HTML_XPATH/lobbyistRepresentsContent/selector",
                                "values" : { "selector" : "//d:lobbyist_represent_id/text()" },
                                "yields" : "lobbyist_represent_id"
                            },
                            {
                                "action" : "EXTRACT_HTML_XPATH/lobbyistRepresentsContent/selector",
                                "values" : { "selector" : "//d:lobbyist_represent_name/text()" },
                                "yields" : "lobbyist_represent_name"
                            },
                            {
                                "action" : "EXTRACT_HTML_XPATH/lobbyistRepresentsContent/selector",
                                "values" : { "selector" : "//d:lobbyist_represent_domain/text()" },
                                "yields" : "lobbyist_represent_domain"
                            },
                            {
                                "action" : "EXTRACT_HTML_XPATH/lobbyistRepresentsContent/selector",
                                "values" : { "selector" : "//d:lobbyist_represent_type/text()" },
                                "yields" : "lobbyist_represent_type"
                            },
                            {
                                "action" : "AS_JSON/keys",
                                "values" : {
                                    "keys" : [
                                        "lobbyist_represent_id", "lobbyist_represent_name",
                                        "lobbyist_represent_domain", "lobbyist_represent_type"
                                    ]
                                }
                            }
                        ]
                    }
                },
                {
                    "action" : "AS_JSON/keys",
                    "values" : {
                        "keys" : [
                            "id", "first_name", "family_name", "profession", "faction_name",
                            "lobyst_permit_type", "corporation_name", "corporation_id",
                            "faction_member", "lobbyistRepresentJsons"
                        ]
                    }
                }
            ]
        }
    },
    {
        "action" : "TO_FILE/keys/file",
        "values" : {
            "keys" : ["lobbyistJsons"],
            "file" : "lobbyists.json"
        }
    }
]

The above JSON produces a lobbyists.json file containing all the aggregated data.

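We will run it using legs.jar lobbyists.json --log log.txt, where lobbyists.json is the Legs file shown above. (Note: the TO_FILE step also writes its output to lobbyists.json, so the Legs file and the output file need distinct names or directories.)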

The log file will record the run in a concise, clear format, including log levels etc.

Optional: add actions to the JSON file that write to the log.