From cfdcb6b03b15638188cee8353664504c459e89fd Mon Sep 17 00:00:00 2001 From: "nikola.milosevic" Date: Mon, 14 Jul 2014 13:06:48 +0100 Subject: [PATCH 1/9] Added class for face image search --- examples/ImageExample.py | 17 +++ xgoogle/search.py | 265 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 282 insertions(+) create mode 100644 examples/ImageExample.py diff --git a/examples/ImageExample.py b/examples/ImageExample.py new file mode 100644 index 0000000..d625649 --- /dev/null +++ b/examples/ImageExample.py @@ -0,0 +1,17 @@ +# +# This program does a Google search for face images for "quick and dirty" and returns +# 50 results. +# + +from xgoogle.search import GoogleFaceImageSearch, SearchError +try: + gs = GoogleFaceImageSearch("quick and dirty") + gs.results_per_page = 50 + results = gs.get_results() + for res in results: + print res.trumb.encode('utf8') + print res.url.encode('utf8') + print +except SearchError, e: + print "Search failed: %s" % e + diff --git a/xgoogle/search.py b/xgoogle/search.py index 98b681e..a4b381a 100755 --- a/xgoogle/search.py +++ b/xgoogle/search.py @@ -3,6 +3,8 @@ # # Peteris Krumins (peter@catonmat.net) # http://www.catonmat.net -- good coders code, great reuse +# Updated by Nikola Milosevic (nikola.milosevic@inspiratron.org +# http://www.inspiratron.org # # http://www.catonmat.net/blog/python-library-for-google-search/ # @@ -39,6 +41,14 @@ def __str__(self): def html(self): return self.tag.prettify() + +class FaceImageSearchResult: + def __init__(self, trumb, url): + self.url = url + self.trumb = trumb + + def __str__(self): + return 'Google Search Result: "%s"' % self.title class SearchResult: def __init__(self, title, url, desc): @@ -298,6 +308,261 @@ def ascii_replacer(m): s = re.sub(r'&#(\d+);', ascii_replacer, str, re.U) return re.sub(r'&([^;]+);', entity_replacer, s, re.U) + + +class GoogleFaceImageSearch(object): + SEARCH_URL_0 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s" + NEXT_PAGE_0 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&start=%(start)d" + SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&num=%(num)d" + NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d" + + def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None): + self.query = query + self.debug = debug + self.browser = Browser(debug=debug) + self.results_info = None + self.eor = False # end of results + self._page = 0 + self._first_indexed_in_previous = None + self._filetype = None + self._last_search_url = None + self._results_per_page = 10 + self._last_from = 0 + self._lang = lang + self._tld = tld + + if re_search_strings: + self._re_search_strings = re_search_strings + elif lang == "de": + self._re_search_strings = ("Ergebnisse", "von", u"ungefähr") + elif lang == "es": + self._re_search_strings = ("Resultados", "de", "aproximadamente") + # add more localised versions here + else: + self._re_search_strings = ("Results", "of", "about") + + if random_agent: + self.browser.set_random_user_agent() + + @property + def num_results(self): + if not self.results_info: + page = self._get_results_page() + self.results_info = self._extract_info(page) + if self.results_info['total'] == 0: + self.eor = True + return self.results_info['total'] + + @property + def last_search_url(self): + return self._last_search_url + + def _get_page(self): + return self._page + + def _set_page(self, page): + self._page = page + + page = property(_get_page, _set_page) + + def _get_first_indexed_in_previous(self): + return self._first_indexed_in_previous + + def _set_first_indexed_in_previous(self, interval): + if interval == "day": + self._first_indexed_in_previous = 'd' + elif interval == "week": + self._first_indexed_in_previous = 'w' + elif interval == "month": + self._first_indexed_in_previous = 'm' + elif interval == "year": + self._first_indexed_in_previous = 'y' + else: + # a floating point value is a number of months + try: + num = float(interval) + except ValueError: + raise SearchError, "Wrong parameter to first_indexed_in_previous: %s" % (str(interval)) + self._first_indexed_in_previous = 'm' + str(interval) + + first_indexed_in_previous = property(_get_first_indexed_in_previous, _set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months") + + def _get_filetype(self): + return self._filetype + + def _set_filetype(self, filetype): + self._filetype = filetype + + filetype = property(_get_filetype, _set_filetype, doc="file extension to search for") + + def _get_results_per_page(self): + return self._results_per_page + + def _set_results_par_page(self, rpp): + self._results_per_page = rpp + + results_per_page = property(_get_results_per_page, _set_results_par_page) + + def get_results(self): + """ Gets a page of results """ + if self.eor: + return [] + MAX_VALUE = 1000000 + page = self._get_results_page() + results = self._extract_results(page) + search_info = {'from': self.results_per_page*self._page, + 'to': self.results_per_page*self._page + len(results), + 'total': MAX_VALUE} + if not self.results_info: + self.results_info = search_info + if self.num_results == 0: + self.eor = True + return [] + if not results: + self.eor = True + return [] + if self._page > 0 and search_info['from'] == self._last_from: + self.eor = True + return [] + if search_info['to'] == search_info['total']: + self.eor = True + self._page += 1 + self._last_from = search_info['from'] + return results + + def _maybe_raise(self, cls, *arg): + if self.debug: + raise cls(*arg) + + def _get_results_page(self): + if self._page == 0: + if self._results_per_page == 10: + url = GoogleFaceImageSearch.SEARCH_URL_0 + else: + url = GoogleFaceImageSearch.SEARCH_URL_1 + else: + if self._results_per_page == 10: + url = GoogleFaceImageSearch.NEXT_PAGE_0 + else: + url = GoogleFaceImageSearch.NEXT_PAGE_1 + + safe_url = [url % { 'query': urllib.quote_plus(self.query), + 'start': self._page * self._results_per_page, + 'num': self._results_per_page, + 'tld' : self._tld, + 'lang' : self._lang }] + + # possibly extend url with optional properties + if self._first_indexed_in_previous: + safe_url.extend(["&as_qdr=", self._first_indexed_in_previous]) + if self._filetype: + safe_url.extend(["&as_filetype=", self._filetype]) + + safe_url = "".join(safe_url) + self._last_search_url = safe_url + + try: + page = self.browser.get_page(safe_url) + except BrowserError, e: + raise SearchError, "Failed getting %s: %s" % (e.url, e.error) + + return BeautifulSoup(page) + + def _extract_info(self, soup): + empty_info = {'from': 0, 'to': 0, 'total': 0} + div_ssb = soup.find('div', id='ssb') + if not div_ssb: + self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup) + return empty_info + p = div_ssb.find('p') + if not p: + self._maybe_raise(ParseError, """

tag within

was not found on Google search page""", soup) + return empty_info + txt = ''.join(p.findAll(text=True)) + txt = txt.replace(',', '') + matches = re.search(r'%s (\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U) + if not matches: + return empty_info + return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))} + + def _extract_results(self, soup): + # Should extract + + desc = ''.join(s for s in desc_strs if s) + return self._html_unescape(desc) + + def _html_unescape(self, str): + def entity_replacer(m): + entity = m.group(1) + if entity in name2codepoint: + return unichr(name2codepoint[entity]) + else: + return m.group(0) + + def ascii_replacer(m): + cp = int(m.group(1)) + if cp <= 255: + return unichr(cp) + else: + return m.group(0) + + s = re.sub(r'&#(\d+);', ascii_replacer, str, re.U) + return re.sub(r'&([^;]+);', entity_replacer, s, re.U) class BlogSearch(GoogleSearch): From 5327fc93e28cb625a624f1e996514897e0f65422 Mon Sep 17 00:00:00 2001 From: "nikola.milosevic" Date: Mon, 14 Jul 2014 13:11:02 +0100 Subject: [PATCH 2/9] Update of about files --- contributors.txt | 3 +++ readme.txt | 9 ++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/contributors.txt b/contributors.txt index 53e917d..debc2ae 100755 --- a/contributors.txt +++ b/contributors.txt @@ -1,5 +1,8 @@ This file contains people who have helped xgoogle project: + * Nikola Milosevic + Thanks for adding: + * Face image search * Holger Berndt Thanks for adding: * 'lang' and 'tld' arguments to Google Search diff --git a/readme.txt b/readme.txt index 3835b39..96d80f2 100755 --- a/readme.txt +++ b/readme.txt @@ -1,7 +1,10 @@ -This is a Google library called 'xgoogle'. Current version is 1.3. +This is a fork of a Google library called 'xgoogle'. Current version is 1.3.1 -It's written by Peteris Krumins (peter@catonmat.net). -His blog is at http://www.catonmat.net -- good coders code, great reuse. +It is forked by Nikola Milosevic (nikola.milosevic@inspiratron.org) from the original code that was written by Peteris Krumins (peter@catonmat.net). +Peteris Krumins blog is at http://www.catonmat.net -- good coders code, great reuse. +Nikola Milosevic's blog is at http://www.inspiratron.org. + +This fork adds Google face image search, and hopefully in the future Google image search The code is licensed under MIT license. From 1d76d7320abcebc8daa6e28307c9ea5cad5c62ff Mon Sep 17 00:00:00 2001 From: "nikola.milosevic" Date: Mon, 14 Jul 2014 13:12:35 +0100 Subject: [PATCH 3/9] Readme update --- readme.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/readme.txt b/readme.txt index 96d80f2..f33887c 100755 --- a/readme.txt +++ b/readme.txt @@ -190,13 +190,11 @@ v1.1: * added Google Sponsored Links Search. v1.2: * added Google Sets module v1.3: * added Google Translate module * fixed a bug in browser.py when KeyboardInterrupt did not get propagated. +v1.3: * added Google Face image search -------------------------------------------------------------------------- That's it. Have fun! :) -Sincerely, -Peteris Krumins -http://www.catonmat.net From d3996208a81fa2a7f0bfee2d08d8cac81f79f78d Mon Sep 17 00:00:00 2001 From: "nikola.milosevic" Date: Mon, 14 Jul 2014 13:31:47 +0100 Subject: [PATCH 4/9] Added norman image search --- examples/ImageExample2.py | 17 +++ readme.txt | 4 +- xgoogle/search.py | 255 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 274 insertions(+), 2 deletions(-) create mode 100644 examples/ImageExample2.py diff --git a/examples/ImageExample2.py b/examples/ImageExample2.py new file mode 100644 index 0000000..afe37bb --- /dev/null +++ b/examples/ImageExample2.py @@ -0,0 +1,17 @@ +# +# This program does a Google search for images for "quick and dirty" and returns +# 50 results. +# + +from xgoogle.search import GoogleImageSearch, SearchError +try: + gs = GoogleImageSearch("quick and dirty") + gs.results_per_page = 50 + results = gs.get_results() + for res in results: + print res.trumb.encode('utf8') + print res.url.encode('utf8') + print +except SearchError, e: + print "Search failed: %s" % e + diff --git a/readme.txt b/readme.txt index f33887c..6408af5 100755 --- a/readme.txt +++ b/readme.txt @@ -1,4 +1,4 @@ -This is a fork of a Google library called 'xgoogle'. Current version is 1.3.1 +This is a fork of a Google library called 'xgoogle'. Current version is 1.4 It is forked by Nikola Milosevic (nikola.milosevic@inspiratron.org) from the original code that was written by Peteris Krumins (peter@catonmat.net). Peteris Krumins blog is at http://www.catonmat.net -- good coders code, great reuse. @@ -190,7 +190,7 @@ v1.1: * added Google Sponsored Links Search. v1.2: * added Google Sets module v1.3: * added Google Translate module * fixed a bug in browser.py when KeyboardInterrupt did not get propagated. -v1.3: * added Google Face image search +v1.4: * added Google image and face image search -------------------------------------------------------------------------- diff --git a/xgoogle/search.py b/xgoogle/search.py index a4b381a..7934afa 100755 --- a/xgoogle/search.py +++ b/xgoogle/search.py @@ -308,6 +308,260 @@ def ascii_replacer(m): s = re.sub(r'&#(\d+);', ascii_replacer, str, re.U) return re.sub(r'&([^;]+);', entity_replacer, s, re.U) + +class GoogleImageSearch(object): + SEARCH_URL_0 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s" + NEXT_PAGE_0 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&start=%(start)d" + SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&num=%(num)d" + NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d" + + def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None): + self.query = query + self.debug = debug + self.browser = Browser(debug=debug) + self.results_info = None + self.eor = False # end of results + self._page = 0 + self._first_indexed_in_previous = None + self._filetype = None + self._last_search_url = None + self._results_per_page = 10 + self._last_from = 0 + self._lang = lang + self._tld = tld + + if re_search_strings: + self._re_search_strings = re_search_strings + elif lang == "de": + self._re_search_strings = ("Ergebnisse", "von", u"ungefähr") + elif lang == "es": + self._re_search_strings = ("Resultados", "de", "aproximadamente") + # add more localised versions here + else: + self._re_search_strings = ("Results", "of", "about") + + if random_agent: + self.browser.set_random_user_agent() + + @property + def num_results(self): + if not self.results_info: + page = self._get_results_page() + self.results_info = self._extract_info(page) + if self.results_info['total'] == 0: + self.eor = True + return self.results_info['total'] + + @property + def last_search_url(self): + return self._last_search_url + + def _get_page(self): + return self._page + + def _set_page(self, page): + self._page = page + + page = property(_get_page, _set_page) + + def _get_first_indexed_in_previous(self): + return self._first_indexed_in_previous + + def _set_first_indexed_in_previous(self, interval): + if interval == "day": + self._first_indexed_in_previous = 'd' + elif interval == "week": + self._first_indexed_in_previous = 'w' + elif interval == "month": + self._first_indexed_in_previous = 'm' + elif interval == "year": + self._first_indexed_in_previous = 'y' + else: + # a floating point value is a number of months + try: + num = float(interval) + except ValueError: + raise SearchError, "Wrong parameter to first_indexed_in_previous: %s" % (str(interval)) + self._first_indexed_in_previous = 'm' + str(interval) + + first_indexed_in_previous = property(_get_first_indexed_in_previous, _set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months") + + def _get_filetype(self): + return self._filetype + + def _set_filetype(self, filetype): + self._filetype = filetype + + filetype = property(_get_filetype, _set_filetype, doc="file extension to search for") + + def _get_results_per_page(self): + return self._results_per_page + + def _set_results_par_page(self, rpp): + self._results_per_page = rpp + + results_per_page = property(_get_results_per_page, _set_results_par_page) + + def get_results(self): + """ Gets a page of results """ + if self.eor: + return [] + MAX_VALUE = 1000000 + page = self._get_results_page() + results = self._extract_results(page) + search_info = {'from': self.results_per_page*self._page, + 'to': self.results_per_page*self._page + len(results), + 'total': MAX_VALUE} + if not self.results_info: + self.results_info = search_info + if self.num_results == 0: + self.eor = True + return [] + if not results: + self.eor = True + return [] + if self._page > 0 and search_info['from'] == self._last_from: + self.eor = True + return [] + if search_info['to'] == search_info['total']: + self.eor = True + self._page += 1 + self._last_from = search_info['from'] + return results + + def _maybe_raise(self, cls, *arg): + if self.debug: + raise cls(*arg) + + def _get_results_page(self): + if self._page == 0: + if self._results_per_page == 10: + url = GoogleImageSearch.SEARCH_URL_0 + else: + url = GoogleImageSearch.SEARCH_URL_1 + else: + if self._results_per_page == 10: + url = GoogleImageSearch.NEXT_PAGE_0 + else: + url = GoogleImageSearch.NEXT_PAGE_1 + + safe_url = [url % { 'query': urllib.quote_plus(self.query), + 'start': self._page * self._results_per_page, + 'num': self._results_per_page, + 'tld' : self._tld, + 'lang' : self._lang }] + + # possibly extend url with optional properties + if self._first_indexed_in_previous: + safe_url.extend(["&as_qdr=", self._first_indexed_in_previous]) + if self._filetype: + safe_url.extend(["&as_filetype=", self._filetype]) + + safe_url = "".join(safe_url) + self._last_search_url = safe_url + + try: + page = self.browser.get_page(safe_url) + except BrowserError, e: + raise SearchError, "Failed getting %s: %s" % (e.url, e.error) + + return BeautifulSoup(page) + + def _extract_info(self, soup): + empty_info = {'from': 0, 'to': 0, 'total': 0} + div_ssb = soup.find('div', id='ssb') + if not div_ssb: + self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup) + return empty_info + p = div_ssb.find('p') + if not p: + self._maybe_raise(ParseError, """

tag within

was not found on Google search page""", soup) + return empty_info + txt = ''.join(p.findAll(text=True)) + txt = txt.replace(',', '') + matches = re.search(r'%s (\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U) + if not matches: + return empty_info + return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))} + + def _extract_results(self, soup): + # Should extract + + desc = ''.join(s for s in desc_strs if s) + return self._html_unescape(desc) + + def _html_unescape(self, str): + def entity_replacer(m): + entity = m.group(1) + if entity in name2codepoint: + return unichr(name2codepoint[entity]) + else: + return m.group(0) + + def ascii_replacer(m): + cp = int(m.group(1)) + if cp <= 255: + return unichr(cp) + else: + return m.group(0) + + s = re.sub(r'&#(\d+);', ascii_replacer, str, re.U) + return re.sub(r'&([^;]+);', entity_replacer, s, re.U) class GoogleFaceImageSearch(object): @@ -641,4 +895,5 @@ def looper(tag): desc = ''.join(s for s in desc_strs if s) return self._html_unescape(desc) + From 7a8e147171c9710f4d80dba72ee5181c0a0a0203 Mon Sep 17 00:00:00 2001 From: "nikola.milosevic" Date: Tue, 15 Jul 2014 12:22:14 +0100 Subject: [PATCH 5/9] Added Google video search. It requires NLTK --- examples/exampleVideoSearch.py | 22 +++ readme.txt | 1 + xgoogle/search.py | 294 ++++++++++++++++++++++++++++++++- 3 files changed, 316 insertions(+), 1 deletion(-) create mode 100755 examples/exampleVideoSearch.py diff --git a/examples/exampleVideoSearch.py b/examples/exampleVideoSearch.py new file mode 100755 index 0000000..807585c --- /dev/null +++ b/examples/exampleVideoSearch.py @@ -0,0 +1,22 @@ +# +# This program does a Google search for video for "Iron Maiden" and returns +# 50 results. Video search requires NLTK. For instruction on installation please visit http://www.nltk.org/install.html +# + +from xgoogle.search import GoogleVideoSearch, SearchError +try: + gs = GoogleVideoSearch("Iron Maiden") + gs.results_per_page = 50 + results = gs.get_results() + for res in results: + print 'Name: ' + res.name.encode('utf8') + print 'URL: ' + res.url.encode('utf8') + print 'Date: ' + res.date.encode('utf8') + print 'Duration: ' + res.duration.encode('utf8') + print 'Author: ' + res.author.encode('utf8') + print 'Description: ' + res.description.encode('utf8') + + print +except SearchError, e: + print "Search failed: %s" % e + diff --git a/readme.txt b/readme.txt index 6408af5..2d4e929 100755 --- a/readme.txt +++ b/readme.txt @@ -191,6 +191,7 @@ v1.2: * added Google Sets module v1.3: * added Google Translate module * fixed a bug in browser.py when KeyboardInterrupt did not get propagated. v1.4: * added Google image and face image search + * added Google video search (requires NLTK, for install instruction see http://www.nltk.org/install.html) -------------------------------------------------------------------------- diff --git a/xgoogle/search.py b/xgoogle/search.py index 7934afa..41129cf 100755 --- a/xgoogle/search.py +++ b/xgoogle/search.py @@ -15,6 +15,7 @@ import urllib from htmlentitydefs import name2codepoint from BeautifulSoup import BeautifulSoup +import nltk from browser import Browser, BrowserError @@ -42,13 +43,34 @@ def __str__(self): def html(self): return self.tag.prettify() + +# videoname = nltk.clean_html(str(h3[0])) +# video_url = result.findAll('cite') +# date_and_author = result.find('div',{'class':'f slp'}) +# da = str.split(str(date_and_author,' - Uploaded by')) +# date = da[0] +# author = da[1] +# desc = result.find('span',{'class':'st'}) +# description = nltk.clean_html(str(desc)) +class FaceVideoSearchResult: + def __init__(self, name, url, description,date,duration,author): + self.name = name + self.url = url + self.description = description + self.date = date + self.duration = duration + self.author= author + + def __str__(self): + return 'Google Search Result: "%s"' % self.name + class FaceImageSearchResult: def __init__(self, trumb, url): self.url = url self.trumb = trumb def __str__(self): - return 'Google Search Result: "%s"' % self.title + return 'Google Search Result: "%s"' % self.trumb class SearchResult: def __init__(self, title, url, desc): @@ -308,6 +330,276 @@ def ascii_replacer(m): s = re.sub(r'&#(\d+);', ascii_replacer, str, re.U) return re.sub(r'&([^;]+);', entity_replacer, s, re.U) + +class GoogleVideoSearch(object): + SEARCH_URL_0 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s" + NEXT_PAGE_0 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&start=%(start)d" + SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&num=%(num)d" + NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d" + + def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None): + self.query = query + self.debug = debug + self.browser = Browser(debug=debug) + self.results_info = None + self.eor = False # end of results + self._page = 0 + self._first_indexed_in_previous = None + self._filetype = None + self._last_search_url = None + self._results_per_page = 10 + self._last_from = 0 + self._lang = lang + self._tld = tld + + if re_search_strings: + self._re_search_strings = re_search_strings + elif lang == "de": + self._re_search_strings = ("Ergebnisse", "von", u"ungefähr") + elif lang == "es": + self._re_search_strings = ("Resultados", "de", "aproximadamente") + # add more localised versions here + else: + self._re_search_strings = ("Results", "of", "about") + + if random_agent: + self.browser.set_random_user_agent() + + @property + def num_results(self): + if not self.results_info: + page = self._get_results_page() + self.results_info = self._extract_info(page) + if self.results_info['total'] == 0: + self.eor = True + return self.results_info['total'] + + @property + def last_search_url(self): + return self._last_search_url + + def _get_page(self): + return self._page + + def _set_page(self, page): + self._page = page + + page = property(_get_page, _set_page) + + def _get_first_indexed_in_previous(self): + return self._first_indexed_in_previous + + def _set_first_indexed_in_previous(self, interval): + if interval == "day": + self._first_indexed_in_previous = 'd' + elif interval == "week": + self._first_indexed_in_previous = 'w' + elif interval == "month": + self._first_indexed_in_previous = 'm' + elif interval == "year": + self._first_indexed_in_previous = 'y' + else: + # a floating point value is a number of months + try: + num = float(interval) + except ValueError: + raise SearchError, "Wrong parameter to first_indexed_in_previous: %s" % (str(interval)) + self._first_indexed_in_previous = 'm' + str(interval) + + first_indexed_in_previous = property(_get_first_indexed_in_previous, _set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months") + + def _get_filetype(self): + return self._filetype + + def _set_filetype(self, filetype): + self._filetype = filetype + + filetype = property(_get_filetype, _set_filetype, doc="file extension to search for") + + def _get_results_per_page(self): + return self._results_per_page + + def _set_results_par_page(self, rpp): + self._results_per_page = rpp + + results_per_page = property(_get_results_per_page, _set_results_par_page) + + def get_results(self): + """ Gets a page of results """ + if self.eor: + return [] + MAX_VALUE = 1000000 + page = self._get_results_page() + results = self._extract_results(page) + search_info = {'from': self.results_per_page*self._page, + 'to': self.results_per_page*self._page + len(results), + 'total': MAX_VALUE} + if not self.results_info: + self.results_info = search_info + if self.num_results == 0: + self.eor = True + return [] + if not results: + self.eor = True + return [] + if self._page > 0 and search_info['from'] == self._last_from: + self.eor = True + return [] + if search_info['to'] == search_info['total']: + self.eor = True + self._page += 1 + self._last_from = search_info['from'] + return results + + def _maybe_raise(self, cls, *arg): + if self.debug: + raise cls(*arg) + + def _get_results_page(self): + if self._page == 0: + if self._results_per_page == 10: + url = GoogleVideoSearch.SEARCH_URL_0 + else: + url = GoogleVideoSearch.SEARCH_URL_1 + else: + if self._results_per_page == 10: + url = GoogleVideoSearch.NEXT_PAGE_0 + else: + url = GoogleVideoSearch.NEXT_PAGE_1 + + safe_url = [url % { 'query': urllib.quote_plus(self.query), + 'start': self._page * self._results_per_page, + 'num': self._results_per_page, + 'tld' : self._tld, + 'lang' : self._lang }] + + # possibly extend url with optional properties + if self._first_indexed_in_previous: + safe_url.extend(["&as_qdr=", self._first_indexed_in_previous]) + if self._filetype: + safe_url.extend(["&as_filetype=", self._filetype]) + + safe_url = "".join(safe_url) + self._last_search_url = safe_url + + try: + page = self.browser.get_page(safe_url) + except BrowserError, e: + raise SearchError, "Failed getting %s: %s" % (e.url, e.error) + + return BeautifulSoup(page) + + def _extract_info(self, soup): + empty_info = {'from': 0, 'to': 0, 'total': 0} + div_ssb = soup.find('div', id='ssb') + if not div_ssb: + self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup) + return empty_info + p = div_ssb.find('p') + if not p: + self._maybe_raise(ParseError, """

tag within

was not found on Google search page""", soup) + return empty_info + txt = ''.join(p.findAll(text=True)) + txt = txt.replace(',', '') + matches = re.search(r'%s (\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U) + if not matches: + return empty_info + return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))} + + def _extract_results(self, soup): + # Should extract + + desc = ''.join(s for s in desc_strs if s) + return self._html_unescape(desc) + + def _html_unescape(self, str): + def entity_replacer(m): + entity = m.group(1) + if entity in name2codepoint: + return unichr(name2codepoint[entity]) + else: + return m.group(0) + + def ascii_replacer(m): + cp = int(m.group(1)) + if cp <= 255: + return unichr(cp) + else: + return m.group(0) + + s = re.sub(r'&#(\d+);', ascii_replacer, str, re.U) + return re.sub(r'&([^;]+);', entity_replacer, s, re.U) class GoogleImageSearch(object): SEARCH_URL_0 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s" From 6c0fa59bded7d45240bc18cb5e8447a255020098 Mon Sep 17 00:00:00 2001 From: "nikola.milosevic86" Date: Tue, 15 Jul 2014 12:27:38 +0100 Subject: [PATCH 6/9] Comments deleted --- xgoogle/search.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xgoogle/search.py b/xgoogle/search.py index 41129cf..5252032 100755 --- a/xgoogle/search.py +++ b/xgoogle/search.py @@ -507,9 +507,7 @@ def _extract_info(self, soup): return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))} def _extract_results(self, soup): - # Should extract Date: Tue, 15 Jul 2014 12:37:18 +0100 Subject: [PATCH 7/9] Setup file updated --- setup.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 1027405..6f79f79 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages import sys -__version__ = '1.3' +__version__ = '1.4' import os def _read(fname): @@ -14,8 +14,8 @@ def _read(fname): long_description=_read('readme.txt'), classifiers=[], keywords='google search', - author='Peteris Krumins', - author_email='peter@catonmat.net', + author='Peteris Krumins, Nikola Milosevic', + author_email='nikola.milosevic@inspiratron.org', url='http://github.com/pkrumins/xgoogle', license='MIT', packages=find_packages(exclude=['ez_setup', 'examples', 'tests']), @@ -25,6 +25,7 @@ def _read(fname): include_package_data=True, zip_safe=False, install_requires=[ + 'nltk==2.0.4' # -*- Extra requirements: -*- ], ) From 8e60e3cc9c185ce8bc743b613eec6b785aed1d30 Mon Sep 17 00:00:00 2001 From: "nikola.milosevic86" Date: Thu, 17 Jul 2014 16:42:24 +0100 Subject: [PATCH 8/9] Change on one example --- examples/ImageExample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ImageExample.py b/examples/ImageExample.py index d625649..3093f6e 100644 --- a/examples/ImageExample.py +++ b/examples/ImageExample.py @@ -5,7 +5,7 @@ from xgoogle.search import GoogleFaceImageSearch, SearchError try: - gs = GoogleFaceImageSearch("quick and dirty") + gs = GoogleFaceImageSearch("Eddard Stark") gs.results_per_page = 50 results = gs.get_results() for res in results: From 13b8d2540e32f03909b232dae686a6ffc397615a Mon Sep 17 00:00:00 2001 From: "nikola.milosevic86" Date: Wed, 10 Sep 2014 19:28:09 +0100 Subject: [PATCH 9/9] Added excerpt to the code of google search. Excerpt is the best match found in the searched web page. This is done by Michele Filannino and myself --- examples/example1.py | 3 ++- examples/exampleVideoSearch.py | 3 +-- xgoogle/search.py | 34 +++++++++++++++++++++++++++++++--- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/examples/example1.py b/examples/example1.py index 79e14cb..5121109 100755 --- a/examples/example1.py +++ b/examples/example1.py @@ -6,12 +6,13 @@ from xgoogle.search import GoogleSearch, SearchError try: - gs = GoogleSearch("quick and dirty") + gs = GoogleSearch("game of thrones season 3") gs.results_per_page = 50 results = gs.get_results() for res in results: print res.title.encode('utf8') print res.desc.encode('utf8') + print res.excerpt.encode('utf8') print res.url.encode('utf8') print except SearchError, e: diff --git a/examples/exampleVideoSearch.py b/examples/exampleVideoSearch.py index 807585c..98910b3 100755 --- a/examples/exampleVideoSearch.py +++ b/examples/exampleVideoSearch.py @@ -14,8 +14,7 @@ print 'Date: ' + res.date.encode('utf8') print 'Duration: ' + res.duration.encode('utf8') print 'Author: ' + res.author.encode('utf8') - print 'Description: ' + res.description.encode('utf8') - + print 'Description: ' + res.description.encode('utf8') print except SearchError, e: print "Search failed: %s" % e diff --git a/xgoogle/search.py b/xgoogle/search.py index 5252032..4c3283d 100755 --- a/xgoogle/search.py +++ b/xgoogle/search.py @@ -73,10 +73,11 @@ def __str__(self): return 'Google Search Result: "%s"' % self.trumb class SearchResult: - def __init__(self, title, url, desc): + def __init__(self, title, url, desc,excerpt): self.title = title self.url = url self.desc = desc + self.excerpt = excerpt def __str__(self): return 'Google Search Result: "%s"' % self.title @@ -269,9 +270,12 @@ def _extract_results(self, soup): def _extract_result(self, result): title, url = self._extract_title_url(result) desc = self._extract_description(result) - if not title or not url or not desc: + if desc == None: + desc = '' + excerpt = self._extract_excerpt(result) + if not title or not url or not (desc or excerpt): return None - return SearchResult(title, url, desc) + return SearchResult(title, url, desc,excerpt) def _extract_title_url(self, result): #title_a = result.find('a', {'class': re.compile(r'\bl\b')}) @@ -286,6 +290,30 @@ def _extract_title_url(self, result): if match: url = urllib.unquote(match.group(1)) return title, url + + def _extract_excerpt(self, result): + def looper(tag): + if not tag: return + for t in tag: + try: + if t.name == 'br': pass + except AttributeError: + pass + try: + desc_strs.append(t.string) + except AttributeError: + desc_strs.append(t) + desc_div = result.find('span', {'class': re.compile(r'\bst\b')}) + if not desc_div: + self._maybe_raise(ParseError, "Content excerpt tag in Google search result was not found", result) + return None + desc_strs = [] + + looper(desc_div) + looper(desc_div.find('wbr')) # BeautifulSoup does not self-close + + desc = ''.join(s for s in desc_strs if s) + return self._html_unescape(desc) def _extract_description(self, result): desc_div = result.find('div', {'class': re.compile(r'\bs\b')})