From 4ca4751bfc3d6efd637aa342b6daeefafe167cd9 Mon Sep 17 00:00:00 2001 From: tomMulholland Date: Tue, 22 Apr 2014 20:36:49 -0500 Subject: [PATCH 1/4] Update search.py modified the functions _extract_results and _extract_description according to these suggestions: http://www.catonmat.net/c/27124 --- xgoogle/search.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xgoogle/search.py b/xgoogle/search.py index 98b681e..85d0460 100755 --- a/xgoogle/search.py +++ b/xgoogle/search.py @@ -322,7 +322,8 @@ def _extract_info(self, soup): return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))} def _extract_results(self, soup): - results = soup.findAll('p', {'class': 'g'}) + #results = soup.findAll('p', {'class': 'g'}) + results = soup.findAll('li','g') ret_res = [] for result in results: eres = self._extract_result(result) @@ -352,7 +353,8 @@ def _extract_title_url(self, result): return title, url def _extract_description(self, result): - desc_td = result.findNext('td') + #desc_td = result.findNext('td') + desc_div = result.find('span', 'st')) if not desc_td: self._maybe_raise(ParseError, "Description tag in Google search result was not found", result) return None From 0db69c674626f7a2b1702a7ae1179e1b8471389b Mon Sep 17 00:00:00 2001 From: Justin Vieira Date: Fri, 17 Jul 2015 13:55:57 -0400 Subject: [PATCH 2/4] fix extra '(' in master --- xgoogle/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xgoogle/search.py b/xgoogle/search.py index 85d0460..d5306fc 100755 --- a/xgoogle/search.py +++ b/xgoogle/search.py @@ -354,7 +354,7 @@ def _extract_title_url(self, result): def _extract_description(self, result): #desc_td = result.findNext('td') - desc_div = result.find('span', 'st')) + desc_div = result.find('span', 'st') if not desc_td: self._maybe_raise(ParseError, "Description tag in Google search result was not found", result) return None From 2ec56085c4bb46885f53298ea207a10aac1a3939 Mon Sep 17 00:00:00 2001 From: Justin Vieira Date: Fri, 17 Jul 2015 16:45:37 -0400 Subject: [PATCH 3/4] Example 1 would only return 5 results (50 were expected). This was because it would not return any results missing a Title or Desc - most were missing Desc - apparently this may have become way more common recently, or there may be a bug retrieving it somewhere? Changed this so it now returns the correct 50 results, ignoring the incomplete fields. --- examples/example1.py | 9 ++++++--- xgoogle/search.py | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/example1.py b/examples/example1.py index 79e14cb..a7b4107 100755 --- a/examples/example1.py +++ b/examples/example1.py @@ -10,9 +10,12 @@ gs.results_per_page = 50 results = gs.get_results() for res in results: - print res.title.encode('utf8') - print res.desc.encode('utf8') - print res.url.encode('utf8') + if res.title is not None: + print res.title.encode('utf8') + if res.desc is not None: + print res.desc.encode('utf8') + if res.url is not None: + print res.url.encode('utf8') print except SearchError, e: print "Search failed: %s" % e diff --git a/xgoogle/search.py b/xgoogle/search.py index d5306fc..fa9569b 100755 --- a/xgoogle/search.py +++ b/xgoogle/search.py @@ -237,8 +237,8 @@ def _extract_results(self, soup): def _extract_result(self, result): title, url = self._extract_title_url(result) desc = self._extract_description(result) - if not title or not url or not desc: - return None + #if not title or not url or not desc: + # return None return SearchResult(title, url, desc) def _extract_title_url(self, result): From dd29c8058ab526250cadbc15cd8902c901a9afa7 Mon Sep 17 00:00:00 2001 From: Justin Vieira Date: Fri, 17 Jul 2015 17:17:05 -0400 Subject: [PATCH 4/4] Added example that shows cycling through all results, and being a "good citizen" to Google by waiting a bit after each results page. Also turned on "random_agent" user-agent randomization by default. --- examples/example4.py | 34 ++++++++++++++++++++++++++++++++++ xgoogle/search.py | 2 +- 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 examples/example4.py diff --git a/examples/example4.py b/examples/example4.py new file mode 100644 index 0000000..6053d1e --- /dev/null +++ b/examples/example4.py @@ -0,0 +1,34 @@ +#!/usr/bin/python +# +# Justin Vieira (justin@rancorsoft.com) +# http://www.rancorsoft.com -- Let's Rock Together. +# +# This program does a Google search for "super test results" and returns +# all results. +# + +from xgoogle.search import GoogleSearch, SearchError +from threading import Thread +from random import randint +import time + +try: + gs = GoogleSearch("super test results") + gs.results_per_page = 50 + displayedResults = 0 + results = gs.get_results() + while displayedResults < gs.num_results: + for res in results: + if res.title is not None: + print res.title.encode('utf8') + if res.desc is not None: + print res.desc.encode('utf8') + if res.url is not None: + print res.url.encode('utf8') + displayedResults += gs.results_per_page + print + time.sleep(randint(15,60)) + results = gs.get_results() +except SearchError, e: + print "Search failed: %s" % e + diff --git a/xgoogle/search.py b/xgoogle/search.py index fa9569b..19af0fe 100755 --- a/xgoogle/search.py +++ b/xgoogle/search.py @@ -55,7 +55,7 @@ class GoogleSearch(object): SEARCH_URL_1 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&btnG=Google+Search" NEXT_PAGE_1 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d" - def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None): + def __init__(self, query, random_agent=True, debug=False, lang="en", tld="com", re_search_strings=None): self.query = query self.debug = debug self.browser = Browser(debug=debug)