diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..f3dd5fd --- /dev/null +++ b/.gitattributes @@ -0,0 +1,9 @@ +# Drupal git normalization +# @see http://www.kernel.org/pub/software/scm/git/docs/gitattributes.html +# @see http://drupal.org/node/1542048 + +# Define text file attributes. +# - Ensure no CRLF line-endings, neither on checkout nor on checkin. + +# Auto-detect text files, ensure they use LF. +* text=auto eol=lf diff --git a/.gitignore b/.gitignore index 3102e55..692fe84 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,12 @@ xgoogle.egg-info *~ *.swp +.idea +*.pyc +.project +.pydevproject +.settings + +# Ignore build files. +build/ +dist/ diff --git a/readme.txt b/README.md similarity index 67% rename from readme.txt rename to README.md index 3835b39..5c8c143 100755 --- a/readme.txt +++ b/README.md @@ -1,199 +1,256 @@ -This is a Google library called 'xgoogle'. Current version is 1.3. - -It's written by Peteris Krumins (peter@catonmat.net). -His blog is at http://www.catonmat.net -- good coders code, great reuse. - -The code is licensed under MIT license. - --------------------------------------------------------------------------- - -At the moment it contains: - * Google Search module xgoogle/search.py. - http://www.catonmat.net/blog/python-library-for-google-search/ - - * Google Sponsored Links Search module xgoogle/sponsoredlinks.py - http://www.catonmat.net/blog/python-library-for-google-sponsored-links-search/ - - * Google Sets module xgoogle/googlesets.py - http://www.catonmat.net/blog/python-library-for-google-sets/ - - * Google Translate module xgoogle/translate.py - http://www.catonmat.net/blog/python-library-for-google-translate/ - --------------------------------------------------------------------------- - -Here is an example usage of Google Search module: - - >>> from xgoogle.search import GoogleSearch - >>> gs = GoogleSearch("catonmat") - >>> gs.results_per_page = 25 - >>> results = gs.get_results() - >>> for res in results: - ... print res.title.encode('utf8') - ... - - output: - - good coders code, great reuse - MIT's Introduction to Algorithms, Lectures 1 and 2: Analysis of ... - catonmat - Google Code - ... - -The GoogleSearch object has several public methods and properties: - - method get_results() - gets a page of results, returning a list of SearchResult objects. - property num_results - returns number of search results found. - property results_per_page - sets/gets the number of results to get per page. - property page - sets/gets the search page. - -A SearchResult object has three attributes -- "title", "desc", and "url". -They are Unicode strings, so do a proper encoding before outputting them. - --------------------------------------------------------------------------- - -Here is an example usage of Google Sponsored Links Search module: - - >>> from xgoogle.sponsoredlinks import SponsoredLinks, SLError - >>> sl = SponsoredLinks("video software") - >>> sl.results_per_page = 100 - >>> results = sl.get_results() - >>> for result in results: - ... print result.title.encode('utf8') - ... - - output: - - Photoshop Video Software - Video Poker Software - DVD/Video Rental Software - ... - -The SponsoredLinks object has several public methods and properties: - - method get_results() - gets a page of results, returning a list of SearchResult objects. - property num_results - returns number of search results found. - property results_per_page - sets/gets the number of results to get per page. 
- -A SponsoredLink object has four attributes -- "title", "desc", "url", and "display_url". -They are Unicode strings, don't forget to use a proper encoding before outputting them. - --------------------------------------------------------------------------- - -Here is an example usage of Google Sets module: - - >>> from xgoogle.googlesets import GoogleSets - >>> gs = GoogleSets(['red', 'yellow']) - >>> results = gs.get_results() - >>> print len(results) - >>> for r in results: - ... print r.encode('utf8') - ... - - output: - - red - yellow - blue - white - ... - -The GoogleSets object has only get_results(set_type) public method. The default value -for set_type is SMALL_SET, which makes it return 15 related items or fewer. -Use LARGE_SET to get more than 15 items. This get_results() method returns a list of -related items that are represented as unicode strings. -Don't forget to do the proper encoding when outputting these strings! - -Here is an example showing differences between SMALL_SET and LARGE_SET: - - >>> from xgoogle.googlesets import GoogleSets, LARGE_SET, SMALL_SET - >>> gs = GoogleSets(['python', 'perl']) - >>> results_small = gs.get_results() # SMALL_SET by default - >>> len(results_small) - 11 - >>> results_small - [u'python', u'perl', u'php', u'ruby', u'java', u'javascript', u'c++', u'c', - u'cgi', u'tcl', u'c#'] - >>> - >>> results_large = gs.get_results(LARGE_SET) - >>> len(results_large) - 46 - >>> results_large - [u'perl', u'python', u'java', u'c++', u'php', u'c', u'c#', u'javascript', - u'howto', u'wiki', u'raid', u'dd', u'linux', u'ruby', u'language', u'xml', - u'sgml', u'svn', u'kernel', ...] - - --------------------------------------------------------------------------- - -Here is an example usage of Google Translate module: - - >>> from xgoogle.translate import Translator - >>> - >>> translate = Translator().translate - >>> print translate("Mani sauc Pēteris", lang_to="ru").encode('utf-8') - Меня зовут Петр - >>> print translate("Mani sauc Pēteris", lang_to="en") - My name is Peter - >>> print translate("Меня зовут Петр") - My name is Peter - -The "translate" function takes three arguments - "message", "lang_from" and "lang_to". -If "lang_from" is not given, Google's translation service auto-detects it. -If "lang_to" is not given, it defaults to "en" (English). - -In case of an error the "translate" function throws "TranslationError" exception. -Make sure to wrap your code in try/except block to catch it: - - >>> from xgoogle.translate import Translator, TranslationError - >>> - >>> try: - >>> translate = Translator().translate - >>> print translate("") - >>> except TranslationError, e: - >>> print e - - Failed translating: invalid text - - -The Google Translate module also provides "LanguageDetector" class that can be used -to detect the language of the text. - -Here is an example usage of LanguageDetector: - - >>> from xgoogle.translate import LanguageDetector, DetectionError - >>> - >>> detect = LanguageDetector().detect - >>> english = detect("This is a wonderful library.") - >>> english.lang_code - 'en' - >>> english.lang - 'English' - >>> english.confidence - 0.28078437000000001 - >>> english.is_reliable - True - -The "DetectionError" may get raised if the detection failed. - - --------------------------------------------------------------------------- - - -Version history: - -v1.0: * initial release, xgoogle library contains just the Google Search. -v1.1: * added Google Sponsored Links Search. 
-      * fixed a bug in browser.py that might have thrown an unexpected exception.
-v1.2: * added Google Sets module
-v1.3: * added Google Translate module
-      * fixed a bug in browser.py when KeyboardInterrupt did not get propagated.
-
---------------------------------------------------------------------------
-
-That's it. Have fun! :)
-
-
-Sincerely,
-Peteris Krumins
-http://www.catonmat.net
-
+xgoogle
+=======
+
+A Python wrapper for the Google Search service.
+
+This is a command-line search tool designed to fetch search results from Google.
+
+It provides a wrapper for the following services:
+* Google Search
+* Google Translate
+
+Forked from [xgoogle](https://github.com/pkrumins/xgoogle).
+It was written by Peteris Krumins (peter@catonmat.net).
+His blog is at http://www.catonmat.net -- good coders code, great reuse.
+
+Install
+=======
+
+1. Install the requirements: `pip install -r requirements.txt`.
+
+   Note: Use `pip3` if required.
+
+2. Run the build: `python setup.py build`.
+3. Install: `python setup.py install`.
+
+   Note: Prefix with `sudo` if required.
+
+Features
+========
+
+At the moment it contains:
+
+ * Google Search module (xgoogle/search.py).
+
+   http://www.catonmat.net/blog/python-library-for-google-search/
+
+ * Google Sponsored Links Search module (xgoogle/sponsoredlinks.py)
+
+   http://www.catonmat.net/blog/python-library-for-google-sponsored-links-search/
+
+ * (deprecated) Google Sets module (xgoogle/googlesets.py)
+
+   http://www.catonmat.net/blog/python-library-for-google-sets/
+
+   Please note that Google Sets was shut down on Sep 5, 2011.
+
+ * Google Translate module (xgoogle/translate.py)
+
+   http://www.catonmat.net/blog/python-library-for-google-translate/
+
+ * Google Real-Time Search module (realtime.py)
+ * Google Image Search (check search.py)
+ * Google Video Search (check search.py)
+
+Disclaimer
+==========
+
+Before using, please read the Google [Terms of Service](https://www.google.com/intl/en/policies/terms/):
+
+> Don't misuse our Services.
+> For example, don't interfere with our Services or
+> try to access them using a method
+> other than the interface and the instructions that we provide.
+
+This library is provided for personal study and research.
+
+Usage
+=====
+
+Google Search
+-------------
+
+
+Here is an example usage of the Google Search module:
+
+    >>> from xgoogle.search import GoogleSearch
+    >>> gs = GoogleSearch("catonmat")
+    >>> gs.results_per_page = 25
+    >>> results = gs.get_results()
+    >>> for res in results:
+    ...     print res.title.encode('utf8')
+    ...
+
+    output:
+
+    good coders code, great reuse
+    MIT's Introduction to Algorithms, Lectures 1 and 2: Analysis of ...
+    catonmat - Google Code
+    ...
+
+The GoogleSearch object has several public methods and properties:
+
+    method get_results() - gets a page of results, returning a list of SearchResult objects.
+    property num_results - returns number of search results found.
+    property results_per_page - sets/gets the number of results to get per page.
+    property page - sets/gets the search page.
+
+A SearchResult object has three attributes -- "title", "desc", and "url".
+They are Unicode strings, so do a proper encoding before outputting them.
+
+Google Sponsored Links Search module
+------------------------------------
+
+Note: Sponsored Links Search has changed significantly, so the following example may no longer work.
+ +Here is an example usage of Google Sponsored Links Search module: + + >>> from xgoogle.sponsoredlinks import SponsoredLinks, SLError + >>> sl = SponsoredLinks("video software") + >>> sl.results_per_page = 100 + >>> results = sl.get_results() + >>> for result in results: + ... print result.title.encode('utf8') + ... + + output: + + Photoshop Video Software + Video Poker Software + DVD/Video Rental Software + ... + +The SponsoredLinks object has several public methods and properties: + + method get_results() - gets a page of results, returning a list of SearchResult objects. + property num_results - returns number of search results found. + property results_per_page - sets/gets the number of results to get per page. + +A SponsoredLink object has four attributes -- "title", "desc", "url", and "display_url". +They are Unicode strings, don't forget to use a proper encoding before outputting them. + +Google Sets module +------------------ + +Here is an example usage of Google Sets module: + + >>> from xgoogle.googlesets import GoogleSets + >>> gs = GoogleSets(['red', 'yellow']) + >>> results = gs.get_results() + >>> print len(results) + >>> for r in results: + ... print r.encode('utf8') + ... + + output: + + red + yellow + blue + white + ... + +The GoogleSets object has only get_results(set_type) public method. The default value +for set_type is SMALL_SET, which makes it return 15 related items or fewer. +Use LARGE_SET to get more than 15 items. This get_results() method returns a list of +related items that are represented as unicode strings. +Don't forget to do the proper encoding when outputting these strings! + +Here is an example showing differences between SMALL_SET and LARGE_SET: + + >>> from xgoogle.googlesets import GoogleSets, LARGE_SET, SMALL_SET + >>> gs = GoogleSets(['python', 'perl']) + >>> results_small = gs.get_results() # SMALL_SET by default + >>> len(results_small) + 11 + >>> results_small + [u'python', u'perl', u'php', u'ruby', u'java', u'javascript', u'c++', u'c', + u'cgi', u'tcl', u'c#'] + >>> + >>> results_large = gs.get_results(LARGE_SET) + >>> len(results_large) + 46 + >>> results_large + [u'perl', u'python', u'java', u'c++', u'php', u'c', u'c#', u'javascript', + u'howto', u'wiki', u'raid', u'dd', u'linux', u'ruby', u'language', u'xml', + u'sgml', u'svn', u'kernel', ...] + +Google Translate +---------------- +Here is an example usage of Google Translate module: + + >>> from xgoogle.translate import Translator + >>> + >>> translate = Translator().translate + >>> print translate("Mani sauc Pēteris", lang_to="ru").encode('utf-8') + Меня зовут Петр + >>> print translate("Mani sauc Pēteris", lang_to="en") + My name is Peter + >>> print translate("Меня зовут Петр") + My name is Peter + +The "translate" function takes three arguments - "message", "lang_from" and "lang_to". +If "lang_from" is not given, Google's translation service auto-detects it. +If "lang_to" is not given, it defaults to "en" (English). + +In case of an error the "translate" function throws "TranslationError" exception. +Make sure to wrap your code in try/except block to catch it: + + >>> from xgoogle.translate import Translator, TranslationError + >>> + >>> try: + >>> translate = Translator().translate + >>> print translate("") + >>> except TranslationError, e: + >>> print e + + Failed translating: invalid text + + +The Google Translate module also provides "LanguageDetector" class that can be used +to detect the language of the text. 
+
+Here is an example usage of LanguageDetector:
+
+    >>> from xgoogle.translate import LanguageDetector, DetectionError
+    >>>
+    >>> detect = LanguageDetector().detect
+    >>> english = detect("This is a wonderful library.")
+    >>> english.lang_code
+    'en'
+    >>> english.lang
+    'English'
+    >>> english.confidence
+    0.28078437000000001
+    >>> english.is_reliable
+    True
+
+A "DetectionError" is raised if the detection fails.
+
+Google Image Search
+-------------------
+
+Please check the example: examples/ImageExample.py
+
+Google Video Search
+-------------------
+
+Please check the example: examples/exampleVideoSearch.py
+
+Requirements
+============
+Requires NLTK for Google Video Search; for installation instructions, see http://www.nltk.org/install.html
+
+Contributors
+============
+* kenorb (Python 3.x version, maintenance and bug fixes)
+* Holger Berndt ('lang', 'tld' args, 'filetype' search, 'last_search_url' property, 'date indexed' search)
+* Juanjo Conti (Google Blog Search class)
+* Steve Steiner (setup.py)
+* azappella (bug fixes)
+* Nikola Milosevic (Google Face Image search)
+* Ramon Xuriguera (Google Real-Time search)
+
+License
+=======
+Licensed under the MIT license.
diff --git a/contributors.txt b/contributors.txt
deleted file mode 100755
index 53e917d..0000000
--- a/contributors.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-This file contains people who have helped xgoogle project:
-
- * Holger Berndt
-   Thanks for adding:
-     * 'lang' and 'tld' arguments to Google Search
-     * 'filetype' search
-     * 'last_search_url' property
-     * 'date indexed' search
-
- * Juanjo Conti
-   Thanks for adding Google Blog Search class
-
- * Steve Steiner
-   Thanks for adding setup.py
-
- * azappella
-   Thanks for fixing search.py after redesign
-
-
-PS. If I missed you, please email me at peter@catonmat.net, and I'll add you here!
-
diff --git a/examples/ImageExample.py b/examples/ImageExample.py
new file mode 100644
index 0000000..3093f6e
--- /dev/null
+++ b/examples/ImageExample.py
@@ -0,0 +1,17 @@
+#
+# This program does a Google search for face images for "Eddard Stark" and returns
+# 50 results.
+#
+
+from xgoogle.search import GoogleFaceImageSearch, SearchError
+try:
+    gs = GoogleFaceImageSearch("Eddard Stark")
+    gs.results_per_page = 50
+    results = gs.get_results()
+    for res in results:
+        print res.trumb.encode('utf8')
+        print res.url.encode('utf8')
+        print
+except SearchError, e:
+    print "Search failed: %s" % e
+
diff --git a/examples/ImageExample2.py b/examples/ImageExample2.py
new file mode 100644
index 0000000..afe37bb
--- /dev/null
+++ b/examples/ImageExample2.py
@@ -0,0 +1,17 @@
+#
+# This program does a Google search for images for "quick and dirty" and returns
+# 50 results.
+# + +from xgoogle.search import GoogleImageSearch, SearchError +try: + gs = GoogleImageSearch("quick and dirty") + gs.results_per_page = 50 + results = gs.get_results() + for res in results: + print res.trumb.encode('utf8') + print res.url.encode('utf8') + print +except SearchError, e: + print "Search failed: %s" % e + diff --git a/examples/example1.py b/examples/example1.py index 79e14cb..fea7007 100755 --- a/examples/example1.py +++ b/examples/example1.py @@ -6,13 +6,17 @@ from xgoogle.search import GoogleSearch, SearchError try: - gs = GoogleSearch("quick and dirty") + ask=raw_input("Enter key to be searched:") + gs = GoogleSearch(ask) gs.results_per_page = 50 results = gs.get_results() for res in results: - print res.title.encode('utf8') - print res.desc.encode('utf8') - print res.url.encode('utf8') + if res.title is not None: + print res.title.encode('utf8') + if res.desc is not None: + print res.desc.encode('utf8') + if res.url is not None: + print res.url.encode('utf8') print except SearchError, e: print "Search failed: %s" % e diff --git a/examples/example4.py b/examples/example4.py new file mode 100644 index 0000000..6053d1e --- /dev/null +++ b/examples/example4.py @@ -0,0 +1,34 @@ +#!/usr/bin/python +# +# Justin Vieira (justin@rancorsoft.com) +# http://www.rancorsoft.com -- Let's Rock Together. +# +# This program does a Google search for "super test results" and returns +# all results. +# + +from xgoogle.search import GoogleSearch, SearchError +from threading import Thread +from random import randint +import time + +try: + gs = GoogleSearch("super test results") + gs.results_per_page = 50 + displayedResults = 0 + results = gs.get_results() + while displayedResults < gs.num_results: + for res in results: + if res.title is not None: + print res.title.encode('utf8') + if res.desc is not None: + print res.desc.encode('utf8') + if res.url is not None: + print res.url.encode('utf8') + displayedResults += gs.results_per_page + print + time.sleep(randint(15,60)) + results = gs.get_results() +except SearchError, e: + print "Search failed: %s" % e + diff --git a/examples/exampleVideoSearch.py b/examples/exampleVideoSearch.py new file mode 100755 index 0000000..807585c --- /dev/null +++ b/examples/exampleVideoSearch.py @@ -0,0 +1,22 @@ +# +# This program does a Google search for video for "Iron Maiden" and returns +# 50 results. Video search requires NLTK. For instruction on installation please visit http://www.nltk.org/install.html +# + +from xgoogle.search import GoogleVideoSearch, SearchError +try: + gs = GoogleVideoSearch("Iron Maiden") + gs.results_per_page = 50 + results = gs.get_results() + for res in results: + print 'Name: ' + res.name.encode('utf8') + print 'URL: ' + res.url.encode('utf8') + print 'Date: ' + res.date.encode('utf8') + print 'Duration: ' + res.duration.encode('utf8') + print 'Author: ' + res.author.encode('utf8') + print 'Description: ' + res.description.encode('utf8') + + print +except SearchError, e: + print "Search failed: %s" % e + diff --git a/projects-using-xgoogle.txt b/projects-using-xgoogle.txt index bd32019..5e5cce5 100755 --- a/projects-using-xgoogle.txt +++ b/projects-using-xgoogle.txt @@ -1,21 +1,15 @@ -Here is a list of projects that other people have done, that use my library: - -* fimap -- http://code.google.com/p/fimap/ - - fimap is a little python tool which can find, prepare, audit, exploit and - even google automaticly for local and remote file inclusion bugs in webapps. 
- -* translate.org.za -- http://www.translate.org.za/ - - Translate.org.za is focused on the localisation, or translation, of Open - Source software into South Africa's 11 official languages and localisations - of GNOME, KDE, OpenOffice.org, Firefox and Thunderbird. - -* Top40 -- http://github.com/staticd/Top40 - - This program grabs the top 40 artists played on Alt Nation (a Sirius XM - radio channel) over the past week and runs a Google search on each artist in - an attempt to find links to free music from the artist being searched. - Thanks to xgoogle, I have an interface with Google to make this program - successful. - +Here is a list of projects that other people have done, that use this library: + +* fimap -- http://code.google.com/p/fimap/ + + fimap is a little python tool which can find, prepare, audit, exploit and + even google automaticly for local and remote file inclusion bugs in webapps. + +* Top40 -- http://github.com/staticd/Top40 + + This program grabs the top 40 artists played on Alt Nation (a Sirius XM + radio channel) over the past week and runs a Google search on each artist in + an attempt to find links to free music from the artist being searched. + Thanks to xgoogle, I have an interface with Google to make this program + successful. + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2738f9f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +# pip install -r requirements.txt +beautifulsoup4==4.4.1 +chardet==2.3.0 +colorama==0.3.2 +html5lib==0.999 +nltk==3.0.5 +requests==2.4.3 +six==1.10.0 +urllib3==1.9.1 +wheel==0.24.0 diff --git a/setup.py b/setup.py index 1027405..b928cd9 100644 --- a/setup.py +++ b/setup.py @@ -1,17 +1,17 @@ from setuptools import setup, find_packages import sys -__version__ = '1.3' +__version__ = '1.4' import os def _read(fname): - return open(os.path.join(os.path.dirname(__file__), fname)).read() + return open(os.path.join(os.path.dirname(__file__), fname), encoding='utf-8').read() setup( name='xgoogle', version=__version__, - description="Python library to Google services (google search, google sets, google translate, sponsored links)", - long_description=_read('readme.txt'), + description="Python library to Google services (Google Search, Google Images, Google Videos, Google Translate, Google Real-Time)", + long_description=_read('README.md'), classifiers=[], keywords='google search', author='Peteris Krumins', @@ -25,6 +25,8 @@ def _read(fname): include_package_data=True, zip_safe=False, install_requires=[ - # -*- Extra requirements: -*- + # -*- Extra requirements: -*- + 'beautifulsoup4>=4.0', + 'nltk>=3.0' ], ) diff --git a/tests/test_issue_16.py b/tests/test_issue_16.py new file mode 100644 index 0000000..9ada5c9 --- /dev/null +++ b/tests/test_issue_16.py @@ -0,0 +1,12 @@ +# coding: utf-8 +""" +Simulate: + google_dl -s http://www.marquette.edu/maqom/ -f pdf "" +""" +from xgoogle.search import GoogleSearch + + +def test_name_error_name2codepoint(): + gs = GoogleSearch('site:http://www.marquette.edu/maqom/') + gs._set_filetype('pdf') + assert gs.get_results() diff --git a/xgoogle/BeautifulSoup.py b/xgoogle/BeautifulSoup.py index 0e55aba..9463e7b 100755 --- a/xgoogle/BeautifulSoup.py +++ b/xgoogle/BeautifulSoup.py @@ -89,9 +89,9 @@ import re import sgmllib try: - from htmlentitydefs import name2codepoint + from html.entities import name2codepoint except ImportError: - name2codepoint = {} + from htmlentitydefs import name2codepoint #This hack makes Beautiful Soup able to parse XML with namespaces 
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') @@ -401,7 +401,7 @@ def __getattr__(self, attr): if attr == 'string': return self else: - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)) def __unicode__(self): return str(self).decode(DEFAULT_OUTPUT_ENCODING) @@ -497,10 +497,10 @@ def __init__(self, parser, name, attrs=None, parent=None, self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities # Convert any HTML, XML, or numeric entities in the attribute values. - convert = lambda(k, val): (k, - re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", - self._convertEntities, - val)) + convert = lambda k_val: (k_val[0], + re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", + self._convertEntities, + k_val[1])) self.attrs = map(convert, self.attrs) def get(self, key, default=None): @@ -569,7 +569,7 @@ def __getattr__(self, tag): return self.find(tag[:-3]) elif tag.find('__') != 0: return self.find(tag) - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) + raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag)) def __eq__(self, other): """Returns true iff this tag has the same name, the same attributes, @@ -880,8 +880,7 @@ def search(self, markup): if self._matches(markup, self.text): found = markup else: - raise Exception, "I don't know how to match against a %s" \ - % markup.__class__ + raise Exception("I don't know how to match against a %s" % (markup.__class__)) return found def _matches(self, markup, matchAgainst): @@ -1258,7 +1257,7 @@ def unknown_starttag(self, name, attrs, selfClosing=0): if self.quoteStack: #This is not a real tag. #print "<%s> is not real!" % name - attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) + attrs = ''.join(map(lambda x_y: ' %s="%s"' % (x_y[0], x_y[1]), attrs)) self.handle_data('<%s%s>' % (name, attrs)) return self.endData() @@ -1523,7 +1522,7 @@ def start_meta(self, attrs): # else an encoding was specified explicitly and it # worked. Rewrite the meta tag. newAttr = self.CHARSET_RE.sub\ - (lambda(match):match.group(1) + + (lambda match:match.group(1) + "%SOUP-ENCODING%", contentType) attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], newAttr) @@ -1742,7 +1741,7 @@ def _convertFrom(self, proposed): "iso-8859-1", "iso-8859-2"): markup = re.compile("([\x80-\x9f])").sub \ - (lambda(x): self._subMSChar(x.group(1)), + (lambda x: self._subMSChar(x.group(1)), markup) try: @@ -1750,7 +1749,7 @@ def _convertFrom(self, proposed): u = self._toUnicode(markup, proposed) self.markup = u self.originalEncoding = proposed - except Exception, e: + except Exception(e): # print "That didn't work!" # print e return None @@ -1928,4 +1927,4 @@ def _ebcdic_to_ascii(self, s): if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin.read()) - print soup.prettify() + print(soup.prettify()) diff --git a/xgoogle/browser.py b/xgoogle/browser.py index c7d2618..75159eb 100755 --- a/xgoogle/browser.py +++ b/xgoogle/browser.py @@ -1,105 +1,142 @@ -#!/usr/bin/python -# -# Peteris Krumins (peter@catonmat.net) -# http://www.catonmat.net -- good coders code, great reuse -# -# http://www.catonmat.net/blog/python-library-for-google-search/ -# -# Code is licensed under MIT license. 
-# - -import random -import socket -import urllib -import urllib2 -import httplib - -BROWSERS = ( - # Top most popular browsers in my access.log on 2009.02.12 - # tail -50000 access.log | - # awk -F\" '{B[$6]++} END { for (b in B) { print B[b] ": " b } }' | - # sort -rn | - # head -20 - 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6', - 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.6) Gecko/2009011912 Firefox/3.0.6', - 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6 (.NET CLR 3.5.30729)', - 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6', - 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6', - 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6 (.NET CLR 3.5.30729)', - 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.48 Safari/525.19', - 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)', - 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6', - 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.5) Gecko/2008121621 Ubuntu/8.04 (hardy) Firefox/3.0.5', - 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1', - 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)', - 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)', - 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' -) - -TIMEOUT = 5 # socket timeout - -class BrowserError(Exception): - def __init__(self, url, error): - self.url = url - self.error = error - -class PoolHTTPConnection(httplib.HTTPConnection): - def connect(self): - """Connect to the host and port specified in __init__.""" - msg = "getaddrinfo returns an empty list" - for res in socket.getaddrinfo(self.host, self.port, 0, - socket.SOCK_STREAM): - af, socktype, proto, canonname, sa = res - try: - self.sock = socket.socket(af, socktype, proto) - if self.debuglevel > 0: - print "connect: (%s, %s)" % (self.host, self.port) - self.sock.settimeout(TIMEOUT) - self.sock.connect(sa) - except socket.error, msg: - if self.debuglevel > 0: - print 'connect fail:', (self.host, self.port) - if self.sock: - self.sock.close() - self.sock = None - continue - break - if not self.sock: - raise socket.error, msg - -class PoolHTTPHandler(urllib2.HTTPHandler): - def http_open(self, req): - return self.do_open(PoolHTTPConnection, req) - -class Browser(object): - def __init__(self, user_agent=BROWSERS[0], debug=False, use_pool=False): - self.headers = { - 'User-Agent': user_agent, - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'en-us,en;q=0.5' - } - self.debug = debug - - def get_page(self, url, data=None): - handlers = [PoolHTTPHandler] - opener = urllib2.build_opener(*handlers) - if data: data = urllib.urlencode(data) - request = urllib2.Request(url, data, self.headers) - try: - response = opener.open(request) - return response.read() - except (urllib2.HTTPError, urllib2.URLError), e: - raise BrowserError(url, str(e)) - except (socket.error, socket.sslerror), msg: - raise BrowserError(url, msg) - except socket.timeout, e: - raise BrowserError(url, "timeout") - except 
KeyboardInterrupt: - raise - except: - raise BrowserError(url, "unknown error") - - def set_random_user_agent(self): - self.headers['User-Agent'] = random.choice(BROWSERS) - return self.headers['User-Agent'] - +#!/usr/bin/python +# -*- coding: utf8 -*- +# +# Peteris Krumins (peter@catonmat.net) +# http://www.catonmat.net -- good coders code, great reuse +# +# http://www.catonmat.net/blog/python-library-for-google-search/ +# +# Code is licensed under MIT license. +# + +import sys +import ssl +import random +import socket +import urllib +import urllib.request +import http.client +import http.cookiejar +import http.cookies + + +BROWSERS = ( + # Top most popular browsers in my access.log on 2009.02.12 + # tail -50000 access.log | + # awk -F\" '{B[$6]++} END { for (b in B) { print B[b] ": " b } }' | + # sort -rn | + # head -20 + 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0) Gecko/20100101 Firefox/4.0', + 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)', + 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6', + 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.6) Gecko/2009011912 Firefox/3.0.6', + 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6 (.NET CLR 3.5.30729)', + 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6', + 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6', + 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6 (.NET CLR 3.5.30729)', + 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.48 Safari/525.19', + 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)', + 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6', + 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.5) Gecko/2008121621 Ubuntu/8.04 (hardy) Firefox/3.0.5', + 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36', + 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)', + 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' +) + +TIMEOUT_SOCKET = 5 # socket timeout + +class BrowserError(Exception): + def __init__(self, url, error): + self.url = url + self.error = error + +class PoolHTTPConnection(http.client.HTTPConnection): + def connect(self): + """Connect to the host and port specified in __init__.""" + global TIMEOUT_SOCKET + msg = "getaddrinfo returns an empty list" + for res in socket.getaddrinfo(self.host, self.port, 0, + socket.SOCK_STREAM): + af, socktype, proto, canonname, sa = res + try: + self.sock = socket.socket(af, socktype, proto) + if self.debuglevel > 0: + print("connect: (%s, %s)" % (self.host, self.port)) + self.sock.settimeout(TIMEOUT_SOCKET) + self.sock.connect(sa) + except socket.error as msg: + if self.debuglevel > 0: + print('connect fail:', (self.host, self.port)) + if self.sock: + self.sock.close() + self.sock = None + continue + break + if not self.sock: + raise socket.error(msg) + +class PoolHTTPHandler(urllib.request.HTTPHandler): + def http_open(self, req): + return self.do_open(PoolHTTPConnection, req) + +class 
Browser(object): + """Provide a simulated browser object. + """ + def __init__(self, timeout, user_agent=BROWSERS[0], debug=False, use_pool=False): + global TIMEOUT_SOCKET + TIMEOUT_SOCKET = timeout + self.headers = { + 'User-Agent': user_agent, + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-GB,en-US;q=0.8,en;q=0.6', + # 'Accept-Encoding': 'deflate' + } + self.debug = debug + self._cj = http.cookiejar.CookieJar() + + self.handlers = [PoolHTTPHandler] + self.handlers.append(urllib.request.HTTPCookieProcessor(self._cj)) + + self.opener = urllib.request.build_opener(*self.handlers) + self.opener.addheaders = [] + + ssl._create_default_https_context = ssl._create_unverified_context + + try: + conn = self.opener.open("http://www.google.com/ncr") + conn.info() # retrieve session cookie + except Exception as e: + print(e) + + def get_page(self, url, data=None): + # handlers = [PoolHTTPHandler] + # opener = urllib.request.build_opener(*handlers) + if data: data = urllib.urlencode(data) + request = urllib.request.Request(url, data, self.headers) + try: + response = self.opener.open(request) + return response.read() + except urllib.error.HTTPError as e: + # Check if we've reached the captcha + if e.code == 503: + print("Error: Captcha page has been reached, exiting...") + sys.exit(1) + raise BrowserError(url, str(e)) + except urllib.error.URLError as e: + raise BrowserError(url, str(e)) + except (socket.error, ssl.SSLError) as msg: + raise BrowserError(url, msg) + except socket.timeout as e: + raise BrowserError(url, "timeout") + except KeyboardInterrupt: + raise + except: + raise BrowserError(url, "unknown error") + + def set_random_user_agent(self): + self.headers['User-Agent'] = random.choice(BROWSERS) + return self.headers['User-Agent'] + + def get_user_agent(self): + return self.headers['User-Agent'] diff --git a/xgoogle/googlesets.py b/xgoogle/googlesets.py index af3d1d0..ca3a305 100755 --- a/xgoogle/googlesets.py +++ b/xgoogle/googlesets.py @@ -1,89 +1,87 @@ -#!/usr/bin/python -# -# Peteris Krumins (peter@catonmat.net) -# http://www.catonmat.net -- good coders code, great reuse -# -# http://www.catonmat.net/blog/python-library-for-google-sets/ -# -# Code is licensed under MIT license. -# - -import re -import urllib -import random -from htmlentitydefs import name2codepoint -from BeautifulSoup import BeautifulSoup - -from browser import Browser, BrowserError - -class GSError(Exception): - """ Google Sets Error """ - pass - -class GSParseError(Exception): - """ - Parse error in Google Sets results. 
- self.msg attribute contains explanation why parsing failed - self.tag attribute contains BeautifulSoup object with the most relevant tag that failed to parse - Thrown only in debug mode - """ - - def __init__(self, msg, tag): - self.msg = msg - self.tag = tag - - def __str__(self): - return self.msg - - def html(self): - return self.tag.prettify() - -LARGE_SET = 1 -SMALL_SET = 2 - -class GoogleSets(object): - URL_LARGE = "http://labs.google.com/sets?hl=en&q1=%s&q2=%s&q3=%s&q4=%s&q5=%s&btn=Large+Set" - URL_SMALL = "http://labs.google.com/sets?hl=en&q1=%s&q2=%s&q3=%s&q4=%s&q5=%s&btn=Small+Set+(15+items+or+fewer)" - - def __init__(self, items, random_agent=False, debug=False): - self.items = items - self.debug = debug - self.browser = Browser(debug=debug) - - if random_agent: - self.browser.set_random_user_agent() - - def get_results(self, set_type=SMALL_SET): - page = self._get_results_page(set_type) - results = self._extract_results(page) - return results - - def _maybe_raise(self, cls, *arg): - if self.debug: - raise cls(*arg) - - def _get_results_page(self, set_type): - if set_type == LARGE_SET: - url = GoogleSets.URL_LARGE - else: - url = GoogleSets.URL_SMALL - - safe_items = [urllib.quote_plus(i) for i in self.items] - blank_items = 5 - len(safe_items) - if blank_items > 0: - safe_items += ['']*blank_items - - safe_url = url % tuple(safe_items) - - try: - page = self.browser.get_page(safe_url) - except BrowserError, e: - raise GSError, "Failed getting %s: %s" % (e.url, e.error) - - return BeautifulSoup(page) - - def _extract_results(self, soup): - a_links = soup.findAll('a', href=re.compile('/search')) - ret_res = [a.string for a in a_links] - return ret_res - +#!/usr/bin/python +# +# Peteris Krumins (peter@catonmat.net) +# http://www.catonmat.net -- good coders code, great reuse +# +# http://www.catonmat.net/blog/python-library-for-google-sets/ +# +# Code is licensed under MIT license. +# + +import re +import urllib +from BeautifulSoup import BeautifulSoup + +from browser import Browser, BrowserError + +class GSError(Exception): + """ Google Sets Error """ + pass + +class GSParseError(Exception): + """ + Parse error in Google Sets results. 
+ self.msg attribute contains explanation why parsing failed + self.tag attribute contains BeautifulSoup object with the most relevant tag that failed to parse + Thrown only in debug mode + """ + + def __init__(self, msg, tag): + self.msg = msg + self.tag = tag + + def __str__(self): + return self.msg + + def html(self): + return self.tag.prettify() + +LARGE_SET = 1 +SMALL_SET = 2 + +class GoogleSets(object): + URL_LARGE = "http://labs.google.com/sets?hl=en&q1=%s&q2=%s&q3=%s&q4=%s&q5=%s&btn=Large+Set" + URL_SMALL = "http://labs.google.com/sets?hl=en&q1=%s&q2=%s&q3=%s&q4=%s&q5=%s&btn=Small+Set+(15+items+or+fewer)" + + def __init__(self, items, random_agent=False, debug=False): + self.items = items + self.debug = debug + self.browser = Browser(debug=debug) + + if random_agent: + self.browser.set_random_user_agent() + + def get_results(self, set_type=SMALL_SET): + page = self._get_results_page(set_type) + results = self._extract_results(page) + return results + + def _maybe_raise(self, cls, *arg): + if self.debug: + raise cls(*arg) + + def _get_results_page(self, set_type): + if set_type == LARGE_SET: + url = GoogleSets.URL_LARGE + else: + url = GoogleSets.URL_SMALL + + safe_items = [urllib.quote_plus(i) for i in self.items] + blank_items = 5 - len(safe_items) + if blank_items > 0: + safe_items += ['']*blank_items + + safe_url = url % tuple(safe_items) + + try: + page = self.browser.get_page(safe_url) + except BrowserError(e): + raise GSError("Failed getting %s: %s" % (e.url, e.error)) + + return BeautifulSoup(page) + + def _extract_results(self, soup): + a_links = soup.findAll('a', href=re.compile('/search')) + ret_res = [a.string for a in a_links] + return ret_res + diff --git a/xgoogle/realtime.py b/xgoogle/realtime.py new file mode 100644 index 0000000..e1e57a1 --- /dev/null +++ b/xgoogle/realtime.py @@ -0,0 +1,254 @@ +#!/usr/bin/python +# encoding: utf-8 +# +# Peteris Krumins (peter@catonmat.net) +# http://www.catonmat.net -- good coders code, great reuse +# +# http://www.catonmat.net/blog/python-library-for-google-search/ +# +# Code is licensed under MIT license. +# + +import re +from datetime import datetime +import time +import urllib +from BeautifulSoup import BeautifulSoup + +try: + from html.entities import name2codepoint +except ImportError: + from htmlentitydefs import name2codepoint + +from search import SearchError +from browser import Browser, BrowserError + +class RealtimeSearchError(SearchError): + """ + Base class for Google Realtime Search exceptions. 
+ """ + pass + +class CaptchaError(SearchError): + pass + +class RealtimeResult: + def __init__(self, screen_name, status, timestamp, id, keywords=None): + self.screen_name = screen_name + self.status = status + self.timestamp = timestamp + self.id = id + self.keywords = keywords + + def __str__(self): + return 'Realtime Result:\n\t%s\n\t%s\n\t%s' % (self.screen_name, self.status, self.timestamp) + +class RealtimeSearch(object): + BASE_URL = "http://www.google.%(tld)s" + SEARCH_URL = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&btnG=Google+Search&tbs=mbl:1" + DAY = 86400 + + def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", older=None, interval=43200): + self.query = query + self.debug = debug + self.browser = Browser(debug=debug) + self.eor = False # end of results + self._page = 0 + self._last_search_url = None + self._lang = lang + self._tld = tld + self._interval = interval + + self.older = older + + if random_agent: + self.browser.set_random_user_agent() + + @property + def last_search_url(self): + return self._last_search_url + + def _get_page(self): + return self._page + + def _set_page(self, page): + self._page = page + + page = property(_get_page, _set_page) + + def _set_older(self, older): + self._older = older + + def _get_older(self): + return self._older + + older = property(_get_older, _set_older) + + def get_results(self): + """ Gets a page of results """ + if self.eor: + return [] + + page = self._get_results_page() + + # Check captcha + if self._check_captcha(page): + raise CaptchaError("Found Captcha") + + results = self._extract_results(page) + self._page += 1 + + # Get older link + self.older = self._extract_older_link(page) + if not self.older: + raise RealtimeSearchError("Could not compute older results' link") + + return results + + def _check_captcha(self, page): + form = page.find('form', {'action':'Captcha'}) + return form != None + + def _maybe_raise(self, cls, *arg): + if self.debug: + raise cls(*arg) + + def _get_results_page(self): + if not self.older: + url = RealtimeSearch.SEARCH_URL + safe_url = [url % { 'query': urllib.quote_plus(self.query), + 'tld': self._tld, + 'lang' : self._lang }] + safe_url = "".join(safe_url) + self.older = safe_url + else: + safe_url = self.older + + self._last_search_url = safe_url + try: + page = self.browser.get_page(safe_url) + except BrowserError(e): + raise RealtimeSearchError("Failed getting %s: %s" % (e.url, e.error)) + return BeautifulSoup(page) + + def _extract_results(self, soup): + ret_res = [] + results_wrapper = soup.find('div', {'id':'ires'}) + if not results_wrapper: + return ret_res + results = results_wrapper.findAll('li', {'class': 'g s'}) + for result in results: + eres = self._extract_result(result) + if eres: + ret_res.append(eres) + return ret_res + + def _extract_result(self, result): + try: + screen_name, status, keywords = self._extract_status(result) + timestamp = self._extract_status_timestamp(result) + id = self._extract_status_id(result) + except ValueError: + return None + + if not screen_name or not status or not timestamp: + return None + return RealtimeResult(screen_name, status, timestamp, id, keywords) + + def _extract_status(self, result): + div = result.find('div', {'class':None, 'style':None}) + div_text = div.findAll(text=True) + screen_name = self._html_unescape(div_text.pop(0)) + status = self._html_unescape(''.join(div_text)) + ems = div.findAll('em') + keywords = [] + for em in ems: + 
keywords.append(self._html_unescape(''.join(em.findAll(text=True)))) + return screen_name, status, keywords + + def _extract_status_timestamp(self, result): + span = result.find('span', {'class':'f rtdm'}) + delta = span.find('div', {'class':'rtdelta'}) + if delta: + timestamp = time.time() - int(delta.find(text=True)) + timestamp = time.gmtime(timestamp) + else: + timestamp = span.find(text=True) + # Timestamp example: + # Mar 29, 2011 2:17:05 AM + # %b %d, %Y %I:%M:%S %p + timestamp = time.strptime(timestamp.strip(), '%b %d, %Y %I:%M:%S %p') + + return datetime.fromtimestamp(time.mktime(timestamp)) + + def _extract_status_id(self, result): + link = result.find('a', {'href':re.compile('/status/')}) + if not link: + return None + id = re.findall('status/([\d]*)', link['href']) + if not id: + return None + return long(id[0]) + + def _extract_older_link(self, soup): + url = RealtimeSearch.BASE_URL + safe_url = url % {'tld':self._tld} + + # Try to get the older link + links = soup.find('div', {'class':'s'}) + if links: + links = links.findAll('a') + if links and links[0]['href']: + return ''.join([safe_url, links[0]['href']]) + + # Change the interval to get older tweets + return self._change_interval(self.older) + + def _change_interval(self, current_url): + regex = 'mbl_hs:(?P[\d]*),mbl_he:(?P[\d]*),mbl_rs:(?P[\d]*),mbl_re:(?P[\d]*),' + matchobj = re.search(regex, current_url) + + if not matchobj: + return None + + int_hs, int_he, int_rs, int_re = matchobj.group('hs', 'he', 'rs', 're') + + # Set new interval +# int_re_n = int_rs + + int_re_n = str(int(int_re) - self._interval) + int_rs_n = str(int(int_rs) - self._interval) + + int_hs_n = str(int(int_hs) - self._interval) + int_he_n = str(int(int_he) - self._interval) + +# if int_rs_n < int_hs: +# int_hs_n = str(int(int_hs) - RealtimeSearch.DAY) +# int_he_n = str(int(int_he) - RealtimeSearch.DAY) + + # Replace the parameters in the url + current_url = re.sub(int_hs, int_hs_n, current_url) + current_url = re.sub(int_he, int_he_n, current_url) + current_url = re.sub(int_rs, int_rs_n, current_url) + current_url = re.sub(int_re, int_re_n, current_url) + + return current_url + + + def _html_unescape(self, str): + def entity_replacer(m): + entity = m.group(1) + if entity in name2codepoint: + return unichr(name2codepoint[entity]) + else: + return m.group(0) + + def ascii_replacer(m): + cp = int(m.group(1)) + if cp <= 255: + return unichr(cp) + else: + return m.group(0) + + s = re.sub(r'&#(\d+);', ascii_replacer, str, re.U) + return re.sub(r'&([^;]+);', entity_replacer, s, re.U) diff --git a/xgoogle/search.py b/xgoogle/search.py index 98b681e..7b26199 100755 --- a/xgoogle/search.py +++ b/xgoogle/search.py @@ -1,8 +1,8 @@ -#!/usr/bin/python -# encoding: utf-8 +#!/usr/bin/env python3 +# -*- coding: utf8 -*- # -# Peteris Krumins (peter@catonmat.net) -# http://www.catonmat.net -- good coders code, great reuse +# Peteris Krumins (peter@catonmat.net) [http://www.catonmat.net] +# Updated by Nikola Milosevic (nikola.milosevic@inspiratron.org) [http://www.inspiratron.org] # # http://www.catonmat.net/blog/python-library-for-google-search/ # @@ -11,10 +11,15 @@ import re import urllib -from htmlentitydefs import name2codepoint -from BeautifulSoup import BeautifulSoup +from bs4 import BeautifulSoup +import nltk -from browser import Browser, BrowserError +try: + from html.entities import name2codepoint +except ImportError: + from htmlentitydefs import name2codepoint + +from xgoogle.browser import Browser, BrowserError class SearchError(Exception): """ @@ -29,7 
+34,7 @@ class ParseError(SearchError): self.tag attribute contains BeautifulSoup object with the most relevant tag that failed to parse Thrown only in debug mode """ - + def __init__(self, msg, tag): self.msg = msg self.tag = tag @@ -40,14 +45,61 @@ def __str__(self): def html(self): return self.tag.prettify() + +# videoname = nltk.clean_html(str(h3[0])) +# video_url = result.findAll('cite') +# date_and_author = result.find('div',{'class':'f slp'}) +# da = str.split(str(date_and_author,' - Uploaded by')) +# date = da[0] +# author = da[1] +# desc = result.find('span',{'class':'st'}) +# description = nltk.clean_html(str(desc)) +class FaceVideoSearchResult: + def __init__(self, name, url, description,date,duration,author): + self.name = name + self.url = url + self.description = description + self.date = date + self.duration = duration + self.author= author + + def __str__(self): + return 'Google Search Result: "%s"' % self.name + +class FaceImageSearchResult: + def __init__(self, trumb, url): + self.url = url + self.trumb = trumb + + def __str__(self): + return 'Google Search Result: "%s"' % self.trumb + class SearchResult: - def __init__(self, title, url, desc): + def __init__(self, title='', url='', desc=''): self.title = title self.url = url self.desc = desc + def getURL(self): + return self.url + + def setURL(self, url): + self.url = url + + def getTitle(self): + return self.title + + def setTitle(self, title): + self.title = title + + def getdesc(self): + return self.desc + + def setdesc(self, desc): + self.desc = desc + def __str__(self): - return 'Google Search Result: "%s"' % self.title + return 'Google Search Result: "%s"' % self.url class GoogleSearch(object): SEARCH_URL_0 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&btnG=Google+Search" @@ -55,10 +107,10 @@ class GoogleSearch(object): SEARCH_URL_1 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&btnG=Google+Search" NEXT_PAGE_1 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d" - def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None): + def __init__(self, query, random_agent=True, debug=False, lang="en", tld="com", re_search_strings=None, repeat=None, timeout=5): self.query = query self.debug = debug - self.browser = Browser(debug=debug) + self.browser = Browser(debug=debug, timeout=timeout) self.results_info = None self.eor = False # end of results self._page = 0 @@ -69,11 +121,12 @@ def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", self._last_from = 0 self._lang = lang self._tld = tld - + self.repeat = repeat + if re_search_strings: self._re_search_strings = re_search_strings elif lang == "de": - self._re_search_strings = ("Ergebnisse", "von", u"ungefähr") + self._re_search_strings = ("Ergebnisse", "von", "ungefähr") elif lang == "es": self._re_search_strings = ("Resultados", "de", "aproximadamente") # add more localised versions here @@ -121,19 +174,19 @@ def _set_first_indexed_in_previous(self, interval): try: num = float(interval) except ValueError: - raise SearchError, "Wrong parameter to first_indexed_in_previous: %s" % (str(interval)) + raise SearchError("Wrong parameter to first_indexed_in_previous: %s" % (str(interval))) self._first_indexed_in_previous = 'm' + str(interval) - + first_indexed_in_previous = property(_get_first_indexed_in_previous, _set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months") - + def 
_get_filetype(self): return self._filetype def _set_filetype(self, filetype): self._filetype = filetype - + filetype = property(_get_filetype, _set_filetype, doc="file extension to search for") - + def _get_results_per_page(self): return self._results_per_page @@ -175,6 +228,7 @@ def _maybe_raise(self, cls, *arg): raise cls(*arg) def _get_results_page(self): + """Construct search url, and get the page content""" if self._page == 0: if self._results_per_page == 10: url = GoogleSearch.SEARCH_URL_0 @@ -186,29 +240,34 @@ def _get_results_page(self): else: url = GoogleSearch.NEXT_PAGE_1 - safe_url = [url % { 'query': urllib.quote_plus(self.query), + safe_url = [url % { 'query': urllib.parse.quote_plus(self.query), 'start': self._page * self._results_per_page, 'num': self._results_per_page, 'tld' : self._tld, 'lang' : self._lang }] - + # possibly extend url with optional properties if self._first_indexed_in_previous: safe_url.extend(["&as_qdr=", self._first_indexed_in_previous]) if self._filetype: safe_url.extend(["&as_filetype=", self._filetype]) - + if self.repeat: + safe_url.extend(["&filter=", '0']) + safe_url = "".join(safe_url) self._last_search_url = safe_url - + try: page = self.browser.get_page(safe_url) - except BrowserError, e: - raise SearchError, "Failed getting %s: %s" % (e.url, e.error) + except BrowserError as e: + raise SearchError("Failed getting %s: %s" % (e.url, e.error)) - return BeautifulSoup(page) + return BeautifulSoup(page, "html.parser") def _extract_info(self, soup): + """Extract total results + Page X of about XXX results + """ empty_info = {'from': 0, 'to': 0, 'total': 0} div_ssb = soup.find('div', id='ssb') if not div_ssb: @@ -226,7 +285,8 @@ def _extract_info(self, soup): return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))} def _extract_results(self, soup): - results = soup.findAll('li', {'class': 'g'}) + """Extract results from the page""" + results = soup.findAll('div','g') ret_res = [] for result in results: eres = self._extract_result(result) @@ -235,12 +295,293 @@ def _extract_results(self, soup): return ret_res def _extract_result(self, result): + """Extract title,url,desc for a result""" title, url = self._extract_title_url(result) desc = self._extract_description(result) - if not title or not url or not desc: + if not title and not url: return None return SearchResult(title, url, desc) + def _extract_title_url(self, result): + #title_a = result.find('a', {'class': re.compile(r'\bl\b')}) + #title_a = result.find('h3').find('a') + title_a = result.find('a') + if not title_a: + self._maybe_raise(ParseError, "Title tag in Google search result was not found", result) + return None, None + title = ''.join(title_a.findAll(text=True)) + title = self._html_unescape(title) + url = title_a['href'] + match = re.match(r'/url\?q=((http|ftp|https)[^&]+)&', url) + if match: + url = urllib.parse.unquote(match.group(1)) + match = re.match(r'/interstitial\?url=((http|ftp|https)[^&]+)&', url) + if match: + url = urllib.parse.unquote(match.group(1)) + return title, url + + def _extract_description(self, result): + """Seems this is enough""" + desc = result.find('span', {'class': 'st'}) + return desc + + desc_div = result.find('span', 'st') + if not desc_div: + self._maybe_raise(ParseError, "Description tag in Google search result was not found", result) + return None + desc_span = desc_div.find('span', {'class': 'st'}) + if not desc_span: + self._maybe_raise(ParseError, "Description tag in Google search result was not found", 
result) + return None + + desc_strs = [] + def looper(tag): + if not tag: return + for t in tag: + try: + if t.name == 'br': continue + except AttributeError: + pass + + try: + desc_strs.append(t.string) + except AttributeError: + desc_strs.append(t) + + looper(desc_span) + looper(desc_span.find('wbr')) # BeautifulSoup does not self-close + + desc = ''.join(s for s in desc_strs if s) + return self._html_unescape(desc) + + def _html_unescape(self, str): + def entity_replacer(m): + entity = m.group(1) + if entity in name2codepoint: + return chr(name2codepoint[entity]) + else: + return m.group(0) + + def ascii_replacer(m): + cp = int(m.group(1)) + if cp <= 255: + return chr(cp) + else: + return m.group(0) + + s = re.sub(r'&#(\d+);', ascii_replacer, str, re.U) + return re.sub(r'&([^;]+);', entity_replacer, s, re.U) + +class GoogleVideoSearch(object): + SEARCH_URL_0 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s" + NEXT_PAGE_0 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&start=%(start)d" + SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&num=%(num)d" + NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d" + + def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None, timeout=5): + self.query = query + self.debug = debug + self.browser = Browser(debug=debug, timeout=timeout) + self.results_info = None + self.eor = False # end of results + self._page = 0 + self._first_indexed_in_previous = None + self._filetype = None + self._last_search_url = None + self._results_per_page = 10 + self._last_from = 0 + self._lang = lang + self._tld = tld + + if re_search_strings: + self._re_search_strings = re_search_strings + elif lang == "de": + self._re_search_strings = ("Ergebnisse", "von", "ungefähr") + elif lang == "es": + self._re_search_strings = ("Resultados", "de", "aproximadamente") + # add more localised versions here + else: + self._re_search_strings = ("Results", "of", "about") + + if random_agent: + self.browser.set_random_user_agent() + + @property + def num_results(self): + if not self.results_info: + page = self._get_results_page() + self.results_info = self._extract_info(page) + if self.results_info['total'] == 0: + self.eor = True + return self.results_info['total'] + + @property + def last_search_url(self): + return self._last_search_url + + def _get_page(self): + return self._page + + def _set_page(self, page): + self._page = page + + page = property(_get_page, _set_page) + + def _get_first_indexed_in_previous(self): + return self._first_indexed_in_previous + + def _set_first_indexed_in_previous(self, interval): + if interval == "day": + self._first_indexed_in_previous = 'd' + elif interval == "week": + self._first_indexed_in_previous = 'w' + elif interval == "month": + self._first_indexed_in_previous = 'm' + elif interval == "year": + self._first_indexed_in_previous = 'y' + else: + # a floating point value is a number of months + try: + num = float(interval) + except ValueError: + raise SearchError("Wrong parameter to first_indexed_in_previous: %s" % (str(interval))) + self._first_indexed_in_previous = 'm' + str(interval) + + first_indexed_in_previous = property(_get_first_indexed_in_previous, _set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months") + + def _get_filetype(self): + return self._filetype + + def _set_filetype(self, filetype): + self._filetype = filetype + 
+ filetype = property(_get_filetype, _set_filetype, doc="file extension to search for") + + def _get_results_per_page(self): + return self._results_per_page + + def _set_results_par_page(self, rpp): + self._results_per_page = rpp + + results_per_page = property(_get_results_per_page, _set_results_par_page) + + def get_results(self): + """ Gets a page of results """ + if self.eor: + return [] + MAX_VALUE = 1000000 + page = self._get_results_page() + results = self._extract_results(page) + search_info = {'from': self.results_per_page*self._page, + 'to': self.results_per_page*self._page + len(results), + 'total': MAX_VALUE} + if not self.results_info: + self.results_info = search_info + if self.num_results == 0: + self.eor = True + return [] + if not results: + self.eor = True + return [] + if self._page > 0 and search_info['from'] == self._last_from: + self.eor = True + return [] + if search_info['to'] == search_info['total']: + self.eor = True + self._page += 1 + self._last_from = search_info['from'] + return results + + def _maybe_raise(self, cls, *arg): + if self.debug: + raise cls(*arg) + + def _get_results_page(self): + if self._page == 0: + if self._results_per_page == 10: + url = GoogleVideoSearch.SEARCH_URL_0 + else: + url = GoogleVideoSearch.SEARCH_URL_1 + else: + if self._results_per_page == 10: + url = GoogleVideoSearch.NEXT_PAGE_0 + else: + url = GoogleVideoSearch.NEXT_PAGE_1 + + safe_url = [url % { 'query': urllib.parse.quote_plus(self.query), + 'start': self._page * self._results_per_page, + 'num': self._results_per_page, + 'tld' : self._tld, + 'lang' : self._lang }] + + # possibly extend url with optional properties + if self._first_indexed_in_previous: + safe_url.extend(["&as_qdr=", self._first_indexed_in_previous]) + if self._filetype: + safe_url.extend(["&as_filetype=", self._filetype]) + + safe_url = "".join(safe_url) + self._last_search_url = safe_url + + try: + page = self.browser.get_page(safe_url) + except BrowserError as e: + raise SearchError("Failed getting %s: %s" % (e.url, e.error)) + + return BeautifulSoup(page) + + def _extract_info(self, soup): + empty_info = {'from': 0, 'to': 0, 'total': 0} + div_ssb = soup.find('div', id='ssb') + if not div_ssb: + self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup) + return empty_info + p = div_ssb.find('p') + if not p: + self._maybe_raise(ParseError, """

<p> tag within <div id='ssb'>
was not found on Google search page""", soup) + return empty_info + txt = ''.join(p.findAll(text=True)) + txt = txt.replace(',', '') + matches = re.search(r'%s (\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U) + if not matches: + return empty_info + return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))} + + def _extract_results(self, soup): + results = soup.findAll('li', {"class" : re.compile(r'\b(g videobox|g)\b')}) + ret_res = [] + for result in results: + eres = self._extract_result(result) + if eres: + ret_res.append(eres) + return ret_res + + def _extract_result(self, result): + + h3=result.findAll('h3') + name = '' + for lonuri in h3: + name = name + str(lonuri) + videoname = nltk.clean_html(str(name)) + video_url = BeautifulSoup(str(h3)).findAll('a') + url = str.split(video_url[0]['href'][7:].encode('utf8'),'&')[0] + url = url.replace('%3F', '?') + url = url.replace('%3D', '=') + desc = result.find('span',{'class':'st'}) + meta = result.find('span',{'class':'f'}) + author = '' + duration = '' + date = '' + if(not meta== None): + metastr= nltk.clean_html(str(meta)) + metaarr = metastr.split('-') + date = metaarr[0] + duration = metaarr[1] + author = metaarr[2][13:] + description = nltk.clean_html(str(desc)) + return FaceVideoSearchResult(videoname,url,description,date,duration,author) + #return FaceImageSearchResult(trumnail, image) + def _extract_title_url(self, result): #title_a = result.find('a', {'class': re.compile(r'\bl\b')}) title_a = result.find('a') @@ -252,7 +593,7 @@ def _extract_title_url(self, result): url = title_a['href'] match = re.match(r'/url\?q=(http[^&]+)&', url) if match: - url = urllib.unquote(match.group(1)) + url = urllib.parse.unquote(match.group(1)) return title, url def _extract_description(self, result): @@ -285,14 +626,537 @@ def _html_unescape(self, str): def entity_replacer(m): entity = m.group(1) if entity in name2codepoint: - return unichr(name2codepoint[entity]) + return chr(name2codepoint[entity]) + else: + return m.group(0) + + def ascii_replacer(m): + cp = int(m.group(1)) + if cp <= 255: + return chr(cp) + else: + return m.group(0) + + s = re.sub(r'&#(\d+);', ascii_replacer, str, re.U) + return re.sub(r'&([^;]+);', entity_replacer, s, re.U) + +class GoogleImageSearch(object): + SEARCH_URL_0 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s" + NEXT_PAGE_0 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&start=%(start)d" + SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&num=%(num)d" + NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d" + + def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None, timeout=5): + self.query = query + self.debug = debug + self.browser = Browser(debug=debug, timeout=timeout) + self.results_info = None + self.eor = False # end of results + self._page = 0 + self._first_indexed_in_previous = None + self._filetype = None + self._last_search_url = None + self._results_per_page = 10 + self._last_from = 0 + self._lang = lang + self._tld = tld + + if re_search_strings: + self._re_search_strings = re_search_strings + elif lang == "de": + self._re_search_strings = ("Ergebnisse", "von", "ungefähr") + elif lang == "es": + self._re_search_strings = ("Resultados", "de", "aproximadamente") + # add more localised versions here + else: + self._re_search_strings = ("Results", "of", "about") + + if 
random_agent: + self.browser.set_random_user_agent() + + @property + def num_results(self): + if not self.results_info: + page = self._get_results_page() + self.results_info = self._extract_info(page) + if self.results_info['total'] == 0: + self.eor = True + return self.results_info['total'] + + @property + def last_search_url(self): + return self._last_search_url + + def _get_page(self): + return self._page + + def _set_page(self, page): + self._page = page + + page = property(_get_page, _set_page) + + def _get_first_indexed_in_previous(self): + return self._first_indexed_in_previous + + def _set_first_indexed_in_previous(self, interval): + if interval == "day": + self._first_indexed_in_previous = 'd' + elif interval == "week": + self._first_indexed_in_previous = 'w' + elif interval == "month": + self._first_indexed_in_previous = 'm' + elif interval == "year": + self._first_indexed_in_previous = 'y' + else: + # a floating point value is a number of months + try: + num = float(interval) + except ValueError: + raise SearchError("Wrong parameter to first_indexed_in_previous: %s" % (str(interval))) + self._first_indexed_in_previous = 'm' + str(interval) + + first_indexed_in_previous = property(_get_first_indexed_in_previous, _set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months") + + def _get_filetype(self): + return self._filetype + + def _set_filetype(self, filetype): + self._filetype = filetype + + filetype = property(_get_filetype, _set_filetype, doc="file extension to search for") + + def _get_results_per_page(self): + return self._results_per_page + + def _set_results_par_page(self, rpp): + self._results_per_page = rpp + + results_per_page = property(_get_results_per_page, _set_results_par_page) + + def get_results(self): + """ Gets a page of results """ + if self.eor: + return [] + MAX_VALUE = 1000000 + page = self._get_results_page() + results = self._extract_results(page) + search_info = {'from': self.results_per_page*self._page, + 'to': self.results_per_page*self._page + len(results), + 'total': MAX_VALUE} + if not self.results_info: + self.results_info = search_info + if self.num_results == 0: + self.eor = True + return [] + if not results: + self.eor = True + return [] + if self._page > 0 and search_info['from'] == self._last_from: + self.eor = True + return [] + if search_info['to'] == search_info['total']: + self.eor = True + self._page += 1 + self._last_from = search_info['from'] + return results + + def _maybe_raise(self, cls, *arg): + if self.debug: + raise cls(*arg) + + def _get_results_page(self): + if self._page == 0: + if self._results_per_page == 10: + url = GoogleImageSearch.SEARCH_URL_0 + else: + url = GoogleImageSearch.SEARCH_URL_1 + else: + if self._results_per_page == 10: + url = GoogleImageSearch.NEXT_PAGE_0 + else: + url = GoogleImageSearch.NEXT_PAGE_1 + + safe_url = [url % { 'query': urllib.parse.quote_plus(self.query), + 'start': self._page * self._results_per_page, + 'num': self._results_per_page, + 'tld' : self._tld, + 'lang' : self._lang }] + + # possibly extend url with optional properties + if self._first_indexed_in_previous: + safe_url.extend(["&as_qdr=", self._first_indexed_in_previous]) + if self._filetype: + safe_url.extend(["&as_filetype=", self._filetype]) + + safe_url = "".join(safe_url) + self._last_search_url = safe_url + + try: + page = self.browser.get_page(safe_url) + except BrowserError as e: + raise SearchError("Failed getting %s: %s" % (e.url, e.error)) + + return BeautifulSoup(page) + + def 
_extract_info(self, soup): + empty_info = {'from': 0, 'to': 0, 'total': 0} + div_ssb = soup.find('div', id='ssb') + if not div_ssb: + self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup) + return empty_info + p = div_ssb.find('p') + if not p: + self._maybe_raise(ParseError, """

<p> tag within <div id='ssb'>
was not found on Google search page""", soup) + return empty_info + txt = ''.join(p.findAll(text=True)) + txt = txt.replace(',', '') + matches = re.search(r'%s (\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U) + if not matches: + return empty_info + return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))} + + def _extract_results(self, soup): + # Should extract + + desc = ''.join(s for s in desc_strs if s) + return self._html_unescape(desc) + + def _html_unescape(self, str): + def entity_replacer(m): + entity = m.group(1) + if entity in name2codepoint: + return chr(name2codepoint[entity]) + else: + return m.group(0) + + def ascii_replacer(m): + cp = int(m.group(1)) + if cp <= 255: + return chr(cp) + else: + return m.group(0) + + s = re.sub(r'&#(\d+);', ascii_replacer, str, re.U) + return re.sub(r'&([^;]+);', entity_replacer, s, re.U) + + +class GoogleFaceImageSearch(object): + SEARCH_URL_0 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s" + NEXT_PAGE_0 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&start=%(start)d" + SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&num=%(num)d" + NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d" + + def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None, timeout=5): + self.query = query + self.debug = debug + self.browser = Browser(debug=debug, timeout=timeout) + self.results_info = None + self.eor = False # end of results + self._page = 0 + self._first_indexed_in_previous = None + self._filetype = None + self._last_search_url = None + self._results_per_page = 10 + self._last_from = 0 + self._lang = lang + self._tld = tld + + if re_search_strings: + self._re_search_strings = re_search_strings + elif lang == "de": + self._re_search_strings = ("Ergebnisse", "von", "ungefähr") + elif lang == "es": + self._re_search_strings = ("Resultados", "de", "aproximadamente") + elif lang == "fr": + self._re_search_strings = ("résultats", "de", "Environ") + # add more localised versions here + else: + self._re_search_strings = ("Results", "of", "about") + + if random_agent: + self.browser.set_random_user_agent() + + @property + def num_results(self): + if not self.results_info: + page = self._get_results_page() + self.results_info = self._extract_info(page) + if self.results_info['total'] == 0: + self.eor = True + return self.results_info['total'] + + @property + def last_search_url(self): + return self._last_search_url + + def _get_page(self): + return self._page + + def _set_page(self, page): + self._page = page + + page = property(_get_page, _set_page) + + def _get_first_indexed_in_previous(self): + return self._first_indexed_in_previous + + def _set_first_indexed_in_previous(self, interval): + if interval == "day": + self._first_indexed_in_previous = 'd' + elif interval == "week": + self._first_indexed_in_previous = 'w' + elif interval == "month": + self._first_indexed_in_previous = 'm' + elif interval == "year": + self._first_indexed_in_previous = 'y' + else: + # a floating point value is a number of months + try: + num = float(interval) + except ValueError: + raise SearchError("Wrong parameter to first_indexed_in_previous: %s" % (str(interval))) + self._first_indexed_in_previous = 'm' + str(interval) + + first_indexed_in_previous = property(_get_first_indexed_in_previous, 
_set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months") + + def _get_filetype(self): + return self._filetype + + def _set_filetype(self, filetype): + self._filetype = filetype + + filetype = property(_get_filetype, _set_filetype, doc="file extension to search for") + + def _get_results_per_page(self): + return self._results_per_page + + def _set_results_par_page(self, rpp): + self._results_per_page = rpp + + results_per_page = property(_get_results_per_page, _set_results_par_page) + + def get_results(self): + """ Gets a page of results """ + if self.eor: + return [] + MAX_VALUE = 1000000 + page = self._get_results_page() + results = self._extract_results(page) + search_info = {'from': self.results_per_page*self._page, + 'to': self.results_per_page*self._page + len(results), + 'total': MAX_VALUE} + if not self.results_info: + self.results_info = search_info + if self.num_results == 0: + self.eor = True + return [] + if not results: + self.eor = True + return [] + if self._page > 0 and search_info['from'] == self._last_from: + self.eor = True + return [] + if search_info['to'] == search_info['total']: + self.eor = True + self._page += 1 + self._last_from = search_info['from'] + return results + + def _maybe_raise(self, cls, *arg): + if self.debug: + raise cls(*arg) + + def _get_results_page(self): + if self._page == 0: + if self._results_per_page == 10: + url = GoogleFaceImageSearch.SEARCH_URL_0 + else: + url = GoogleFaceImageSearch.SEARCH_URL_1 + else: + if self._results_per_page == 10: + url = GoogleFaceImageSearch.NEXT_PAGE_0 + else: + url = GoogleFaceImageSearch.NEXT_PAGE_1 + + safe_url = [url % { 'query': urllib.parse.quote_plus(self.query), + 'start': self._page * self._results_per_page, + 'num': self._results_per_page, + 'tld' : self._tld, + 'lang' : self._lang }] + + # possibly extend url with optional properties + if self._first_indexed_in_previous: + safe_url.extend(["&as_qdr=", self._first_indexed_in_previous]) + if self._filetype: + safe_url.extend(["&as_filetype=", self._filetype]) + + safe_url = "".join(safe_url) + self._last_search_url = safe_url + + try: + page = self.browser.get_page(safe_url) + except BrowserError as e: + raise SearchError("Failed getting %s: %s" % (e.url, e.error)) + return BeautifulSoup(page) + + def _extract_info(self, soup): + empty_info = {'from': 0, 'to': 0, 'total': 0} + div_ssb = soup.find('div', id='resultStats') + if not div_ssb: + self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup) + return empty_info + #p = div_ssb.find('p') + p = div_ssb + if not p: + self._maybe_raise(ParseError, """

<p> tag within <div id='resultStats'>
was not found on Google search page""", soup) + return empty_info + txt = ''.join(p.findAll(text=True)) + txt = txt.replace(',', '') + txt = txt.replace(' ', '') + #matches = re.search(r'(\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U) + #matches = re.search(r'(\d+) %s' % self._re_search_strings[0], txt, re.U|re.I) + matches = re.search(r'(\d+)', txt, re.U) + + if not matches: + print(self._re_search_strings[0]) + print(txt) + return empty_info + return {'from': 0, 'to': 0, 'total': int(matches.group(1))} + + def _extract_results(self, soup): + # Should extract + + desc = ''.join(s for s in desc_strs if s) + return self._html_unescape(desc) + + def _html_unescape(self, str): + def entity_replacer(m): + entity = m.group(1) + if entity in name2codepoint: + return chr(name2codepoint[entity]) else: return m.group(0) def ascii_replacer(m): cp = int(m.group(1)) if cp <= 255: - return unichr(cp) + return chr(cp) else: return m.group(0) @@ -322,7 +1186,8 @@ def _extract_info(self, soup): return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))} def _extract_results(self, soup): - results = soup.findAll('p', {'class': 'g'}) + #results = soup.findAll('p', {'class': 'g'}) + results = soup.findAll('li','g') ret_res = [] for result in results: eres = self._extract_result(result) @@ -348,11 +1213,12 @@ def _extract_title_url(self, result): url = title_a['href'] match = re.match(r'/url\?q=(http[^&]+)&', url) if match: - url = urllib.unquote(match.group(1)) + url = urllib.parse.unquote(match.group(1)) return title, url def _extract_description(self, result): - desc_td = result.findNext('td') + #desc_td = result.findNext('td') + desc_div = result.find('span', 'st') if not desc_td: self._maybe_raise(ParseError, "Description tag in Google search result was not found", result) return None @@ -376,4 +1242,3 @@ def looper(tag): desc = ''.join(s for s in desc_strs if s) return self._html_unescape(desc) - diff --git a/xgoogle/sponsoredlinks.py b/xgoogle/sponsoredlinks.py index 12febfe..775dbb8 100755 --- a/xgoogle/sponsoredlinks.py +++ b/xgoogle/sponsoredlinks.py @@ -1,235 +1,239 @@ -#!/usr/bin/python -# -# Peteris Krumins (peter@catonmat.net) -# http://www.catonmat.net -- good coders code, great reuse -# -# http://www.catonmat.net/blog/python-library-for-google-sponsored-links-search/ -# -# Code is licensed under MIT license. -# - -import re -import urllib -import random -from htmlentitydefs import name2codepoint -from BeautifulSoup import BeautifulSoup - -from browser import Browser, BrowserError - -# -# TODO: join GoogleSearch and SponsoredLinks classes under a single base class -# - -class SLError(Exception): - """ Sponsored Links Error """ - pass - -class SLParseError(Exception): - """ - Parse error in Google results. 
- self.msg attribute contains explanation why parsing failed - self.tag attribute contains BeautifulSoup object with the most relevant tag that failed to parse - Thrown only in debug mode - """ - - def __init__(self, msg, tag): - self.msg = msg - self.tag = tag - - def __str__(self): - return self.msg - - def html(self): - return self.tag.prettify() - -GET_ALL_SLEEP_FUNCTION = object() - -class SponsoredLink(object): - """ a single sponsored link """ - def __init__(self, title, url, display_url, desc): - self.title = title - self.url = url - self.display_url = display_url - self.desc = desc - -class SponsoredLinks(object): - SEARCH_URL_0 = "http://www.google.com/sponsoredlinks?q=%(query)s&btnG=Search+Sponsored+Links&hl=en" - NEXT_PAGE_0 = "http://www.google.com/sponsoredlinks?q=%(query)s&sa=N&start=%(start)d&hl=en" - SEARCH_URL_1 = "http://www.google.com/sponsoredlinks?q=%(query)s&num=%(num)d&btnG=Search+Sponsored+Links&hl=en" - NEXT_PAGE_1 = "http://www.google.com/sponsoredlinks?q=%(query)s&num=%(num)d&sa=N&start=%(start)d&hl=en" - - def __init__(self, query, random_agent=False, debug=False): - self.query = query - self.debug = debug - self.browser = Browser(debug=debug) - self._page = 0 - self.eor = False - self.results_info = None - self._results_per_page = 10 - - if random_agent: - self.browser.set_random_user_agent() - - @property - def num_results(self): - if not self.results_info: - page = self._get_results_page() - self.results_info = self._extract_info(page) - if self.results_info['total'] == 0: - self.eor = True - return self.results_info['total'] - - def _get_results_per_page(self): - return self._results_per_page - - def _set_results_par_page(self, rpp): - self._results_per_page = rpp - - results_per_page = property(_get_results_per_page, _set_results_par_page) - - def get_results(self): - if self.eor: - return [] - page = self._get_results_page() - info = self._extract_info(page) - if self.results_info is None: - self.results_info = info - if info['to'] == info['total']: - self.eor = True - results = self._extract_results(page) - if not results: - self.eor = True - return [] - self._page += 1 - return results - - def _get_all_results_sleep_fn(self): - return random.random()*5 + 1 # sleep from 1 - 6 seconds - - def get_all_results(self, sleep_function=None): - if sleep_function is GET_ALL_SLEEP_FUNCTION: - sleep_function = self._get_all_results_sleep_fn - if sleep_function is None: - sleep_function = lambda: None - ret_results = [] - while True: - res = self.get_results() - if not res: - return ret_results - ret_results.extend(res) - return ret_results - - def _maybe_raise(self, cls, *arg): - if self.debug: - raise cls(*arg) - - def _extract_info(self, soup): - empty_info = { 'from': 0, 'to': 0, 'total': 0 } - stats_span = soup.find('span', id='stats') - if not stats_span: - return empty_info - txt = ''.join(stats_span.findAll(text=True)) - txt = txt.replace(',', '').replace(" ", ' ') - matches = re.search(r'Results (\d+) - (\d+) of (?:about )?(\d+)', txt) - if not matches: - return empty_info - return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))} - - def _get_results_page(self): - if self._page == 0: - if self._results_per_page == 10: - url = SponsoredLinks.SEARCH_URL_0 - else: - url = SponsoredLinks.SEARCH_URL_1 - else: - if self._results_per_page == 10: - url = SponsoredLinks.NEXT_PAGE_0 - else: - url = SponsoredLinks.NEXT_PAGE_1 - - safe_url = url % { 'query': urllib.quote_plus(self.query), - 'start': self._page * 
self._results_per_page, - 'num': self._results_per_page } - - try: - page = self.browser.get_page(safe_url) - except BrowserError, e: - raise SLError, "Failed getting %s: %s" % (e.url, e.error) - - return BeautifulSoup(page) - - def _extract_results(self, soup): - results = soup.findAll('div', {'class': 'g'}) - ret_res = [] - for result in results: - eres = self._extract_result(result) - if eres: - ret_res.append(eres) - return ret_res - - def _extract_result(self, result): - title, url = self._extract_title_url(result) - display_url = self._extract_display_url(result) # Warning: removes 'cite' from the result - desc = self._extract_description(result) - if not title or not url or not display_url or not desc: - return None - return SponsoredLink(title, url, display_url, desc) - - def _extract_title_url(self, result): - title_a = result.find('a') - if not title_a: - self._maybe_raise(SLParseError, "Title tag in sponsored link was not found", result) - return None, None - title = ''.join(title_a.findAll(text=True)) - title = self._html_unescape(title) - url = title_a['href'] - match = re.search(r'q=(http[^&]+)&', url) - if not match: - self._maybe_raise(SLParseError, "URL inside a sponsored link was not found", result) - return None, None - url = urllib.unquote(match.group(1)) - return title, url - - def _extract_display_url(self, result): - cite = result.find('cite') - if not cite: - self._maybe_raise(SLParseError, " not found inside result", result) - return None - - return ''.join(cite.findAll(text=True)) - - def _extract_description(self, result): - cite = result.find('cite') - if not cite: - return None - cite.extract() - - desc_div = result.find('div', {'class': 'line23'}) - if not desc_div: - self._maybe_raise(ParseError, "Description tag not found in sponsored link", result) - return None - - desc_strs = desc_div.findAll(text=True)[0:-1] - desc = ''.join(desc_strs) - desc = desc.replace("\n", " ") - desc = desc.replace(" ", " ") - return self._html_unescape(desc) - - def _html_unescape(self, str): - def entity_replacer(m): - entity = m.group(1) - if entity in name2codepoint: - return unichr(name2codepoint[entity]) - else: - return m.group(0) - - def ascii_replacer(m): - cp = int(m.group(1)) - if cp <= 255: - return unichr(cp) - else: - return m.group(0) - - s = re.sub(r'&#(\d+);', ascii_replacer, str, re.U) - return re.sub(r'&([^;]+);', entity_replacer, s, re.U) - +#!/usr/bin/python +# +# Peteris Krumins (peter@catonmat.net) +# http://www.catonmat.net -- good coders code, great reuse +# +# http://www.catonmat.net/blog/python-library-for-google-sponsored-links-search/ +# +# Code is licensed under MIT license. +# + +import re +import urllib +import random +from BeautifulSoup import BeautifulSoup + +try: + from html.entities import name2codepoint +except ImportError: + from htmlentitydefs import name2codepoint + +from browser import Browser, BrowserError + +# +# TODO: join GoogleSearch and SponsoredLinks classes under a single base class +# + +class SLError(Exception): + """ Sponsored Links Error """ + pass + +class SLParseError(Exception): + """ + Parse error in Google results. 
+ self.msg attribute contains explanation why parsing failed + self.tag attribute contains BeautifulSoup object with the most relevant tag that failed to parse + Thrown only in debug mode + """ + + def __init__(self, msg, tag): + self.msg = msg + self.tag = tag + + def __str__(self): + return self.msg + + def html(self): + return self.tag.prettify() + +GET_ALL_SLEEP_FUNCTION = object() + +class SponsoredLink(object): + """ a single sponsored link """ + def __init__(self, title, url, display_url, desc): + self.title = title + self.url = url + self.display_url = display_url + self.desc = desc + +class SponsoredLinks(object): + SEARCH_URL_0 = "http://www.google.com/sponsoredlinks?q=%(query)s&btnG=Search+Sponsored+Links&hl=en" + NEXT_PAGE_0 = "http://www.google.com/sponsoredlinks?q=%(query)s&sa=N&start=%(start)d&hl=en" + SEARCH_URL_1 = "http://www.google.com/sponsoredlinks?q=%(query)s&num=%(num)d&btnG=Search+Sponsored+Links&hl=en" + NEXT_PAGE_1 = "http://www.google.com/sponsoredlinks?q=%(query)s&num=%(num)d&sa=N&start=%(start)d&hl=en" + + def __init__(self, query, random_agent=False, debug=False): + self.query = query + self.debug = debug + self.browser = Browser(debug=debug) + self._page = 0 + self.eor = False + self.results_info = None + self._results_per_page = 10 + + if random_agent: + self.browser.set_random_user_agent() + + @property + def num_results(self): + if not self.results_info: + page = self._get_results_page() + self.results_info = self._extract_info(page) + if self.results_info['total'] == 0: + self.eor = True + return self.results_info['total'] + + def _get_results_per_page(self): + return self._results_per_page + + def _set_results_par_page(self, rpp): + self._results_per_page = rpp + + results_per_page = property(_get_results_per_page, _set_results_par_page) + + def get_results(self): + if self.eor: + return [] + page = self._get_results_page() + info = self._extract_info(page) + if self.results_info is None: + self.results_info = info + if info['to'] == info['total']: + self.eor = True + results = self._extract_results(page) + if not results: + self.eor = True + return [] + self._page += 1 + return results + + def _get_all_results_sleep_fn(self): + return random.random()*5 + 1 # sleep from 1 - 6 seconds + + def get_all_results(self, sleep_function=None): + if sleep_function is GET_ALL_SLEEP_FUNCTION: + sleep_function = self._get_all_results_sleep_fn + if sleep_function is None: + sleep_function = lambda: None + ret_results = [] + while True: + res = self.get_results() + if not res: + return ret_results + ret_results.extend(res) + return ret_results + + def _maybe_raise(self, cls, *arg): + if self.debug: + raise cls(*arg) + + def _extract_info(self, soup): + empty_info = { 'from': 0, 'to': 0, 'total': 0 } + stats_span = soup.find('span', id='stats') + if not stats_span: + return empty_info + txt = ''.join(stats_span.findAll(text=True)) + txt = txt.replace(',', '').replace(" ", ' ') + matches = re.search(r'Results (\d+) - (\d+) of (?:about )?(\d+)', txt) + if not matches: + return empty_info + return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))} + + def _get_results_page(self): + if self._page == 0: + if self._results_per_page == 10: + url = SponsoredLinks.SEARCH_URL_0 + else: + url = SponsoredLinks.SEARCH_URL_1 + else: + if self._results_per_page == 10: + url = SponsoredLinks.NEXT_PAGE_0 + else: + url = SponsoredLinks.NEXT_PAGE_1 + + safe_url = url % { 'query': urllib.quote_plus(self.query), + 'start': self._page * 
self._results_per_page, + 'num': self._results_per_page } + + try: + page = self.browser.get_page(safe_url) + except BrowserError(e): + raise SLError("Failed getting %s: %s" % (e.url, e.error)) + + return BeautifulSoup(page) + + def _extract_results(self, soup): + results = soup.findAll('div', {'class': 'g'}) + ret_res = [] + for result in results: + eres = self._extract_result(result) + if eres: + ret_res.append(eres) + return ret_res + + def _extract_result(self, result): + title, url = self._extract_title_url(result) + display_url = self._extract_display_url(result) # Warning: removes 'cite' from the result + desc = self._extract_description(result) + if not title or not url or not display_url or not desc: + return None + return SponsoredLink(title, url, display_url, desc) + + def _extract_title_url(self, result): + title_a = result.find('a') + if not title_a: + self._maybe_raise(SLParseError, "Title tag in sponsored link was not found", result) + return None, None + title = ''.join(title_a.findAll(text=True)) + title = self._html_unescape(title) + url = title_a['href'] + match = re.search(r'q=(http[^&]+)&', url) + if not match: + self._maybe_raise(SLParseError, "URL inside a sponsored link was not found", result) + return None, None + url = urllib.unquote(match.group(1)) + return title, url + + def _extract_display_url(self, result): + cite = result.find('cite') + if not cite: + self._maybe_raise(SLParseError, " not found inside result", result) + return None + + return ''.join(cite.findAll(text=True)) + + def _extract_description(self, result): + cite = result.find('cite') + if not cite: + return None + cite.extract() + + desc_div = result.find('div', {'class': 'line23'}) + if not desc_div: + self._maybe_raise(ParseError, "Description tag not found in sponsored link", result) + return None + + desc_strs = desc_div.findAll(text=True)[0:-1] + desc = ''.join(desc_strs) + desc = desc.replace("\n", " ") + desc = desc.replace(" ", " ") + return self._html_unescape(desc) + + def _html_unescape(self, str): + def entity_replacer(m): + entity = m.group(1) + if entity in name2codepoint: + return unichr(name2codepoint[entity]) + else: + return m.group(0) + + def ascii_replacer(m): + cp = int(m.group(1)) + if cp <= 255: + return unichr(cp) + else: + return m.group(0) + + s = re.sub(r'&#(\d+);', ascii_replacer, str, re.U) + return re.sub(r'&([^;]+);', entity_replacer, s, re.U) + diff --git a/xgoogle/translate.py b/xgoogle/translate.py index b705072..cd09f1e 100755 --- a/xgoogle/translate.py +++ b/xgoogle/translate.py @@ -1,202 +1,202 @@ -#!/usr/bin/python -# -# Peteris Krumins (peter@catonmat.net) -# http://www.catonmat.net -- good coders code, great reuse -# -# http://www.catonmat.net/blog/python-library-for-google-translate/ -# -# Code is licensed under MIT license. -# - -from browser import Browser, BrowserError -from urllib import quote_plus - -try: import json -except: import simplejson as json - - -class TranslationError(Exception): - pass - -class Translator(object): - translate_url = "http://ajax.googleapis.com/ajax/services/language/translate?v=1.0&q=%(message)s&langpair=%(from)s%%7C%(to)s" - - def __init__(self): - self.browser = Browser() - - def translate(self, message, lang_to='en', lang_from=''): - """ - Given a 'message' translate it from 'lang_from' to 'lang_to'. - If 'lang_from' is empty, auto-detects the language. - Returns the translated message. - """ - - if lang_to not in _languages: - raise TranslationError, "Language %s is not supported as lang_to." 
% lang_to - if lang_from not in _languages and lang_from != '': - raise TranslationError, "Language %s is not supported as lang_from." % lang_from - - message = quote_plus(message) - real_url = Translator.translate_url % { 'message': message, - 'from': lang_from, - 'to': lang_to } - - try: - translation = self.browser.get_page(real_url) - data = json.loads(translation) - - if data['responseStatus'] != 200: - raise TranslationError, "Failed translating: %s" % data['responseDetails'] - - return data['responseData']['translatedText'] - except BrowserError, e: - raise TranslationError, "Failed translating (getting %s failed): %s" % (e.url, e.error) - except ValueError, e: - raise TranslationError, "Failed translating (json failed): %s" % e.message - except KeyError, e: - raise TranslationError, "Failed translating, response didn't contain the translation" - - return None - -class DetectionError(Exception): - pass - -class Language(object): - def __init__(self, lang, confidence, is_reliable): - self.lang_code = lang - self.lang = _languages[lang] - self.confidence = confidence - self.is_reliable = is_reliable - - def __repr__(self): - return '' % (self.lang_code, self.lang) - -class LanguageDetector(object): - detect_url = "http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&q=%(message)s" - - def __init__(self): - self.browser = Browser() - - def detect(self, message): - """ - Given a 'message' detects its language. - Returns Language object. - """ - - message = quote_plus(message) - real_url = LanguageDetector.detect_url % { 'message': message } - - try: - detection = self.browser.get_page(real_url) - data = json.loads(detection) - - if data['responseStatus'] != 200: - raise DetectionError, "Failed detecting language: %s" % data['responseDetails'] - - rd = data['responseData'] - return Language(rd['language'], rd['confidence'], rd['isReliable']) - - except BrowserError, e: - raise DetectionError, "Failed detecting language (getting %s failed): %s" % (e.url, e.error) - except ValueError, e: - raise DetectionErrro, "Failed detecting language (json failed): %s" % e.message - except KeyError, e: - raise DetectionError, "Failed detecting language, response didn't contain the necessary data" - - return None - - -_languages = { - 'af': 'Afrikaans', - 'sq': 'Albanian', - 'am': 'Amharic', - 'ar': 'Arabic', - 'hy': 'Armenian', - 'az': 'Azerbaijani', - 'eu': 'Basque', - 'be': 'Belarusian', - 'bn': 'Bengali', - 'bh': 'Bihari', - 'bg': 'Bulgarian', - 'my': 'Burmese', - 'ca': 'Catalan', - 'chr': 'Cherokee', - 'zh': 'Chinese', - 'zh-CN': 'Chinese_simplified', - 'zh-TW': 'Chinese_traditional', - 'hr': 'Croatian', - 'cs': 'Czech', - 'da': 'Danish', - 'dv': 'Dhivehi', - 'nl': 'Dutch', - 'en': 'English', - 'eo': 'Esperanto', - 'et': 'Estonian', - 'tl': 'Filipino', - 'fi': 'Finnish', - 'fr': 'French', - 'gl': 'Galician', - 'ka': 'Georgian', - 'de': 'German', - 'el': 'Greek', - 'gn': 'Guarani', - 'gu': 'Gujarati', - 'iw': 'Hebrew', - 'hi': 'Hindi', - 'hu': 'Hungarian', - 'is': 'Icelandic', - 'id': 'Indonesian', - 'iu': 'Inuktitut', - 'ga': 'Irish', - 'it': 'Italian', - 'ja': 'Japanese', - 'kn': 'Kannada', - 'kk': 'Kazakh', - 'km': 'Khmer', - 'ko': 'Korean', - 'ku': 'Kurdish', - 'ky': 'Kyrgyz', - 'lo': 'Laothian', - 'lv': 'Latvian', - 'lt': 'Lithuanian', - 'mk': 'Macedonian', - 'ms': 'Malay', - 'ml': 'Malayalam', - 'mt': 'Maltese', - 'mr': 'Marathi', - 'mn': 'Mongolian', - 'ne': 'Nepali', - 'no': 'Norwegian', - 'or': 'Oriya', - 'ps': 'Pashto', - 'fa': 'Persian', - 'pl': 'Polish', - 'pt-PT': 
'Portuguese', - 'pa': 'Punjabi', - 'ro': 'Romanian', - 'ru': 'Russian', - 'sa': 'Sanskrit', - 'sr': 'Serbian', - 'sd': 'Sindhi', - 'si': 'Sinhalese', - 'sk': 'Slovak', - 'sl': 'Slovenian', - 'es': 'Spanish', - 'sw': 'Swahili', - 'sv': 'Swedish', - 'tg': 'Tajik', - 'ta': 'Tamil', - 'tl': 'Tagalog', - 'te': 'Telugu', - 'th': 'Thai', - 'bo': 'Tibetan', - 'tr': 'Turkish', - 'uk': 'Ukrainian', - 'ur': 'Urdu', - 'uz': 'Uzbek', - 'ug': 'Uighur', - 'vi': 'Vietnamese', - 'cy': 'Welsh', - 'yi': 'Yiddish' -}; - +#!/usr/bin/python +# +# Peteris Krumins (peter@catonmat.net) +# http://www.catonmat.net -- good coders code, great reuse +# +# http://www.catonmat.net/blog/python-library-for-google-translate/ +# +# Code is licensed under MIT license. +# + +from browser import Browser, BrowserError +from urllib import quote_plus + +try: import json +except: import simplejson as json + + +class TranslationError(Exception): + pass + +class Translator(object): + translate_url = "http://ajax.googleapis.com/ajax/services/language/translate?v=1.0&q=%(message)s&langpair=%(from)s%%7C%(to)s" + + def __init__(self): + self.browser = Browser() + + def translate(self, message, lang_to='en', lang_from=''): + """ + Given a 'message' translate it from 'lang_from' to 'lang_to'. + If 'lang_from' is empty, auto-detects the language. + Returns the translated message. + """ + + if lang_to not in _languages: + raise TranslationError("Language %s is not supported as lang_to." % lang_to) + if lang_from not in _languages and lang_from != '': + raise TranslationError("Language %s is not supported as lang_from." % lang_from) + + message = quote_plus(message) + real_url = Translator.translate_url % { 'message': message, + 'from': lang_from, + 'to': lang_to } + + try: + translation = self.browser.get_page(real_url) + data = json.loads(translation) + + if data['responseStatus'] != 200: + raise TranslationError("Failed translating: %s" % data['responseDetails']) + + return data['responseData']['translatedText'] + except BrowserError(e): + raise TranslationError("Failed translating (getting %s failed): %s" % (e.url, e.error)) + except ValueError(e): + raise TranslationError("Failed translating (json failed): %s" % e.message) + except KeyError(e): + raise TranslationError("Failed translating, response didn't contain the translation") + + return None + +class DetectionError(Exception): + pass + +class Language(object): + def __init__(self, lang, confidence, is_reliable): + self.lang_code = lang + self.lang = _languages[lang] + self.confidence = confidence + self.is_reliable = is_reliable + + def __repr__(self): + return '' % (self.lang_code, self.lang) + +class LanguageDetector(object): + detect_url = "http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&q=%(message)s" + + def __init__(self): + self.browser = Browser() + + def detect(self, message): + """ + Given a 'message' detects its language. + Returns Language object. 
+ """ + + message = quote_plus(message) + real_url = LanguageDetector.detect_url % { 'message': message } + + try: + detection = self.browser.get_page(real_url) + data = json.loads(detection) + + if data['responseStatus'] != 200: + raise DetectionError("Failed detecting language: %s" % data['responseDetails']) + + rd = data['responseData'] + return Language(rd['language'], rd['confidence'], rd['isReliable']) + + except BrowserError(e): + raise DetectionError("Failed detecting language (getting %s failed): %s" % (e.url, e.error)) + except ValueError(e): + raise DetectionErrro("Failed detecting language (json failed): %s" % e.message) + except KeyError(e): + raise DetectionError("Failed detecting language, response didn't contain the necessary data") + + return None + + +_languages = { + 'af': 'Afrikaans', + 'sq': 'Albanian', + 'am': 'Amharic', + 'ar': 'Arabic', + 'hy': 'Armenian', + 'az': 'Azerbaijani', + 'eu': 'Basque', + 'be': 'Belarusian', + 'bn': 'Bengali', + 'bh': 'Bihari', + 'bg': 'Bulgarian', + 'my': 'Burmese', + 'ca': 'Catalan', + 'chr': 'Cherokee', + 'zh': 'Chinese', + 'zh-CN': 'Chinese_simplified', + 'zh-TW': 'Chinese_traditional', + 'hr': 'Croatian', + 'cs': 'Czech', + 'da': 'Danish', + 'dv': 'Dhivehi', + 'nl': 'Dutch', + 'en': 'English', + 'eo': 'Esperanto', + 'et': 'Estonian', + 'tl': 'Filipino', + 'fi': 'Finnish', + 'fr': 'French', + 'gl': 'Galician', + 'ka': 'Georgian', + 'de': 'German', + 'el': 'Greek', + 'gn': 'Guarani', + 'gu': 'Gujarati', + 'iw': 'Hebrew', + 'hi': 'Hindi', + 'hu': 'Hungarian', + 'is': 'Icelandic', + 'id': 'Indonesian', + 'iu': 'Inuktitut', + 'ga': 'Irish', + 'it': 'Italian', + 'ja': 'Japanese', + 'kn': 'Kannada', + 'kk': 'Kazakh', + 'km': 'Khmer', + 'ko': 'Korean', + 'ku': 'Kurdish', + 'ky': 'Kyrgyz', + 'lo': 'Laothian', + 'lv': 'Latvian', + 'lt': 'Lithuanian', + 'mk': 'Macedonian', + 'ms': 'Malay', + 'ml': 'Malayalam', + 'mt': 'Maltese', + 'mr': 'Marathi', + 'mn': 'Mongolian', + 'ne': 'Nepali', + 'no': 'Norwegian', + 'or': 'Oriya', + 'ps': 'Pashto', + 'fa': 'Persian', + 'pl': 'Polish', + 'pt-PT': 'Portuguese', + 'pa': 'Punjabi', + 'ro': 'Romanian', + 'ru': 'Russian', + 'sa': 'Sanskrit', + 'sr': 'Serbian', + 'sd': 'Sindhi', + 'si': 'Sinhalese', + 'sk': 'Slovak', + 'sl': 'Slovenian', + 'es': 'Spanish', + 'sw': 'Swahili', + 'sv': 'Swedish', + 'tg': 'Tajik', + 'ta': 'Tamil', + 'tl': 'Tagalog', + 'te': 'Telugu', + 'th': 'Thai', + 'bo': 'Tibetan', + 'tr': 'Turkish', + 'uk': 'Ukrainian', + 'ur': 'Urdu', + 'uz': 'Uzbek', + 'ug': 'Uighur', + 'vi': 'Vietnamese', + 'cy': 'Welsh', + 'yi': 'Yiddish' +}; +
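
The classes introduced by this patch (GoogleVideoSearch, GoogleImageSearch and GoogleFaceImageSearch) deliberately mirror the existing GoogleSearch interface: construct with a query, optionally set results_per_page, then call get_results() repeatedly until it returns an empty list. The sketch below is illustrative only; it assumes the patched search.py is importable as xgoogle.search and that SearchError is available there, and the attribute names on each video result are an assumption, since the FaceVideoSearchResult class is defined outside this diff.

    # Minimal usage sketch, not part of the patch.
    # Assumes the patched module is importable as xgoogle.search; the result
    # attributes (e.g. r.url) are assumptions, as FaceVideoSearchResult is
    # defined outside this diff.
    from xgoogle.search import GoogleVideoSearch, SearchError

    try:
        gvs = GoogleVideoSearch("catonmat", random_agent=True)
        gvs.results_per_page = 10        # 10 keeps the plain URL; other values add &num=
        while True:
            results = gvs.get_results()  # returns [] once the end of results is reached
            if not results:
                break
            for r in results:
                print(r.url)             # assumed attribute, see FaceVideoSearchResult
        print(gvs.last_search_url)       # last URL requested, useful when debugging
    except SearchError as e:
        print("Search failed: %s" % e)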