This repository has been archived by the owner on Apr 25, 2023. It is now read-only.
forked from ofri/Open-Knesset
-
Notifications
You must be signed in to change notification settings - Fork 174
WORK IN PROGRESS Specification for using Legs scraping framework
Ori Hoch edited this page Sep 1, 2014
·
3 revisions
This document details the requirements for using the Legs framework to meet the scraping needs of Open Knesset (and other similar projects).
The main Legs JSON file for lobbyists will look like this:
[
{
"action" : "FETCH/url",
"values" : {
"url" : "http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx"
},
"yields" : "lobbyistIndexHtml"
},
{
"action" : "EXTRACT_HTML_XPATH/lobbyistIndexHtml/selector",
"values" : {
"selector" : "//*[@lobbyist_id]/@lobbyist_id"
},
"yields" : "lobbyist_ids"
},
{
"action" : "MAP_PAR/lobbyist_ids/toParam/extractData",
"yields" : "lobbyistJsons",
"values" : {
"toParam":"lobbyist_id",
"extractData" : [
{
"action" : "REPLACE_REGEX/lobbyist_id/match/replace",
"values" : {
"match" : "(.*)",
"replace" : "http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist($1)"
},
"yields" : "lobbyistUrl"
},
{
"action" : "FETCH/lobbyistUrl",
"yields" : "lobbyistHtml"
},
{
"action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
"values" : {
"selector" : "//d:lobbyist_id/text()"
},
"yields" : "id"
},
{
"action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
"values" : {
"selector" : "//d:first_name/text()"
},
"yields" : "first_name"
},
{
"action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
"values" : {
"selector" : "//d:family_name/text()"
},
"yields" : "family_name"
},
{
"action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
"values" : {
"selector" : "//d:profession/text()"
},
"yields" : "profession"
},
{
"action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
"values" : {
"selector" : "//d:faction_name/text()"
},
"yields" : "faction_name"
},
{
"action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
"values" : {
"selector" : "//d:lobyst_permit_type/text()"
},
"yields" : "lobyst_permit_type"
},
{
"action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
"values" : {
"selector" : "//d:corporation_name/text()"
},
"yields" : "corporation_name"
},
{
"action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
"values" : {
"selector" : "//d:corporation_id/text()"
},
"yields" : "corporation_id"
},
{
"action" : "EXTRACT_HTML_XPATH/lobbyistHtml/selector",
"values" : {
"selector" : "//d:faction_member/text()"
},
"yields" : "faction_member"
},
{
"action" : "REPLACE_REGEX/lobbyist_id/match/replace",
"values" : {
"match" : "(.*)",
"replace" : "http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist($1)/lobbist_type"
},
"yields" : "lobbyistRepresentsUrl"
},
{
"action" : "FETCH/lobbyistRepresentsUrl",
"yields" : "lobbyistRepresentsHtml"
},
{
"action" : "EXTRACT_HTML_XPATH/lobbyistRepresentsHtml/selector",
"values" : {
"selector" : "//content"
},
"yields" : "lobbyistRepresentsContents"
},
{
"action" : "MAP_PAR/lobbyistRepresentsContents/toParam/steps",
"yields" : "lobbyistRepresentJsons",
"values" : {
"toParam" : "lobbyistRepresentsContent",
"steps" : [
{
"action" : "EXTRACT_HTML_XPATH/lobbyistRepresentsContent/selector",
"values" : { "selector" : "//d:lobbyist_represent_id/text()" },
"yields" : "lobbyist_represent_id"
},
{
"action" : "EXTRACT_HTML_XPATH/lobbyistRepresentsContent/selector",
"values" : { "selector" : "//d:lobbyist_represent_name/text()" },
"yields" : "lobbyist_represent_name"
},
{
"action" : "EXTRACT_HTML_XPATH/lobbyistRepresentsContent/selector",
"values" : { "selector" : "//d:lobbyist_represent_domain/text()" },
"yields" : "lobbyist_represent_domain"
},
{
"action" : "EXTRACT_HTML_XPATH/lobbyistRepresentsContent/selector",
"values" : { "selector" : "//d:lobbyist_represent_type/text()" },
"yields" : "lobbyist_represent_type"
},
{
"action" : "AS_JSON/keys",
"values" : {
"keys" : [
"lobbyist_represent_id", "lobbyist_represent_name",
"lobbyist_represent_domain", "lobbyist_represent_type"
]
}
}
]
}
},
{
"action" : "AS_JSON/keys",
"values" : {
"keys" : [
"id", "first_name", "family_name", "profession", "faction_name",
"lobyst_permit_type", "corporation_name", "corporation_id",
"faction_member", "lobbyistRepresentJsons"
]
}
}
]
}
},
{
"action" : "TO_FILE/keys/file",
"values" : {
"keys" : ["lobbyistJsons"],
"file" : "lobbyists.json"
}
}
]
The above JSON produces a lobbyists.json file containing all of the aggregated data.
We will run it using legs.jar lobbyists.json --log log.txt
The log file will be written in a concise, clear format that includes log levels, etc.
Optional: add actions to the JSON file that write output to the log.