From cfdcb6b03b15638188cee8353664504c459e89fd Mon Sep 17 00:00:00 2001
From: "nikola.milosevic" <nikola.milosevic@prelovac.com>
Date: Mon, 14 Jul 2014 13:06:48 +0100
Subject: [PATCH 1/9] Added class for face image search

---
 examples/ImageExample.py |  17 +++
 xgoogle/search.py        | 265 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 282 insertions(+)
 create mode 100644 examples/ImageExample.py

diff --git a/examples/ImageExample.py b/examples/ImageExample.py
new file mode 100644
index 0000000..d625649
--- /dev/null
+++ b/examples/ImageExample.py
@@ -0,0 +1,17 @@
+#
+# This program does a Google search for face images for "quick and dirty" and returns
+# 50 results.
+#
+
+from xgoogle.search import GoogleFaceImageSearch, SearchError
+try:
+    gs = GoogleFaceImageSearch("quick and dirty")
+    gs.results_per_page = 50
+    results = gs.get_results()
+    for res in results:
+        print res.trumb.encode('utf8')
+        print res.url.encode('utf8')
+        print
+except SearchError, e:
+    print "Search failed: %s" % e
+
diff --git a/xgoogle/search.py b/xgoogle/search.py
index 98b681e..a4b381a 100755
--- a/xgoogle/search.py
+++ b/xgoogle/search.py
@@ -3,6 +3,8 @@
 #
 # Peteris Krumins (peter@catonmat.net)
 # http://www.catonmat.net  --  good coders code, great reuse
+# Updated by Nikola Milosevic (nikola.milosevic@inspiratron.org
+# http://www.inspiratron.org
 #
 # http://www.catonmat.net/blog/python-library-for-google-search/
 #
@@ -39,6 +41,14 @@ def __str__(self):
 
     def html(self):
         return self.tag.prettify()
+    
+class FaceImageSearchResult:
+    def __init__(self, trumb, url):
+        self.url = url
+        self.trumb = trumb
+
+    def __str__(self):
+        return 'Google Search Result: "%s"' % self.title
 
 class SearchResult:
     def __init__(self, title, url, desc):
@@ -298,6 +308,261 @@ def ascii_replacer(m):
 
         s =    re.sub(r'&#(\d+);',  ascii_replacer, str, re.U)
         return re.sub(r'&([^;]+);', entity_replacer, s, re.U)
+    
+
+class GoogleFaceImageSearch(object):
+    SEARCH_URL_0 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s"
+    NEXT_PAGE_0 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&start=%(start)d"
+    SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&num=%(num)d"
+    NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"
+
+    def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None):
+        self.query = query
+        self.debug = debug
+        self.browser = Browser(debug=debug)
+        self.results_info = None
+        self.eor = False # end of results
+        self._page = 0
+        self._first_indexed_in_previous = None
+        self._filetype = None
+        self._last_search_url = None
+        self._results_per_page = 10
+        self._last_from = 0
+        self._lang = lang
+        self._tld = tld
+        
+        if re_search_strings:
+            self._re_search_strings = re_search_strings
+        elif lang == "de":
+            self._re_search_strings = ("Ergebnisse", "von", u"ungefähr")
+        elif lang == "es":
+            self._re_search_strings = ("Resultados", "de", "aproximadamente")
+        # add more localised versions here
+        else:
+            self._re_search_strings = ("Results", "of", "about")
+
+        if random_agent:
+            self.browser.set_random_user_agent()
+
+    @property
+    def num_results(self):
+        if not self.results_info:
+            page = self._get_results_page()
+            self.results_info = self._extract_info(page)
+            if self.results_info['total'] == 0:
+                self.eor = True
+        return self.results_info['total']
+
+    @property
+    def last_search_url(self):
+        return self._last_search_url
+
+    def _get_page(self):
+        return self._page
+
+    def _set_page(self, page):
+        self._page = page
+
+    page = property(_get_page, _set_page)
+
+    def _get_first_indexed_in_previous(self):
+        return self._first_indexed_in_previous
+
+    def _set_first_indexed_in_previous(self, interval):
+        if interval == "day":
+            self._first_indexed_in_previous = 'd'
+        elif interval == "week":
+            self._first_indexed_in_previous = 'w'
+        elif interval == "month":
+            self._first_indexed_in_previous = 'm'
+        elif interval == "year":
+            self._first_indexed_in_previous = 'y'
+        else:
+            # a floating point value is a number of months
+            try:
+                num = float(interval)
+            except ValueError:
+                raise SearchError, "Wrong parameter to first_indexed_in_previous: %s" % (str(interval))
+            self._first_indexed_in_previous = 'm' + str(interval)
+    
+    first_indexed_in_previous = property(_get_first_indexed_in_previous, _set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months")
+    
+    def _get_filetype(self):
+        return self._filetype
+
+    def _set_filetype(self, filetype):
+        self._filetype = filetype
+    
+    filetype = property(_get_filetype, _set_filetype, doc="file extension to search for")
+    
+    def _get_results_per_page(self):
+        return self._results_per_page
+
+    def _set_results_par_page(self, rpp):
+        self._results_per_page = rpp
+
+    results_per_page = property(_get_results_per_page, _set_results_par_page)
+
+    def get_results(self):
+        """ Gets a page of results """
+        if self.eor:
+            return []
+        MAX_VALUE = 1000000
+        page = self._get_results_page()
+        results = self._extract_results(page)
+        search_info = {'from': self.results_per_page*self._page,
+                       'to': self.results_per_page*self._page + len(results),
+                       'total': MAX_VALUE}
+        if not self.results_info:
+            self.results_info = search_info
+            if self.num_results == 0:
+                self.eor = True
+                return []
+        if not results:
+            self.eor = True
+            return []
+        if self._page > 0 and search_info['from'] == self._last_from:
+            self.eor = True
+            return []
+        if search_info['to'] == search_info['total']:
+            self.eor = True
+        self._page += 1
+        self._last_from = search_info['from']
+        return results
+
+    def _maybe_raise(self, cls, *arg):
+        if self.debug:
+            raise cls(*arg)
+
+    def _get_results_page(self):
+        if self._page == 0:
+            if self._results_per_page == 10:
+                url = GoogleFaceImageSearch.SEARCH_URL_0
+            else:
+                url = GoogleFaceImageSearch.SEARCH_URL_1
+        else:
+            if self._results_per_page == 10:
+                url = GoogleFaceImageSearch.NEXT_PAGE_0
+            else:
+                url = GoogleFaceImageSearch.NEXT_PAGE_1
+
+        safe_url = [url % { 'query': urllib.quote_plus(self.query),
+                           'start': self._page * self._results_per_page,
+                           'num': self._results_per_page,
+                           'tld' : self._tld,
+                           'lang' : self._lang }]
+        
+        # possibly extend url with optional properties
+        if self._first_indexed_in_previous:
+            safe_url.extend(["&as_qdr=", self._first_indexed_in_previous])
+        if self._filetype:
+            safe_url.extend(["&as_filetype=", self._filetype])
+        
+        safe_url = "".join(safe_url)
+        self._last_search_url = safe_url
+        
+        try:
+            page = self.browser.get_page(safe_url)
+        except BrowserError, e:
+            raise SearchError, "Failed getting %s: %s" % (e.url, e.error)
+
+        return BeautifulSoup(page)
+
+    def _extract_info(self, soup):
+        empty_info = {'from': 0, 'to': 0, 'total': 0}
+        div_ssb = soup.find('div', id='ssb')
+        if not div_ssb:
+            self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
+            return empty_info
+        p = div_ssb.find('p')
+        if not p:
+            self._maybe_raise(ParseError, """<p> tag within <div id="ssb"> was not found on Google search page""", soup)
+            return empty_info
+        txt = ''.join(p.findAll(text=True))
+        txt = txt.replace(',', '')
+        matches = re.search(r'%s (\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U)
+        if not matches:
+            return empty_info
+        return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))}
+
+    def _extract_results(self, soup):
+        # Should extract <a href="/url?q=
+        results = soup.findAll('a', href=re.compile("^/url\?q="))
+        #results = soup.findAll('img')
+        ret_res = []
+        for result in results:
+            eres = self._extract_result(result)
+            if eres:
+                ret_res.append(eres)
+        return ret_res
+
+    def _extract_result(self, result):
+        imgsa = result.findAll('img')
+        if len(imgsa)==0:
+            return FaceImageSearchResult("","")
+        else:
+            imgs = imgsa[0]
+        trumnail = imgs['src']
+        image = result['href'][7:]
+        return FaceImageSearchResult(trumnail, image)
+
+    def _extract_title_url(self, result):
+        #title_a = result.find('a', {'class': re.compile(r'\bl\b')})
+        title_a = result.find('a')
+        if not title_a:
+            self._maybe_raise(ParseError, "Title tag in Google search result was not found", result)
+            return None, None
+        title = ''.join(title_a.findAll(text=True))
+        title = self._html_unescape(title)
+        url = title_a['href']
+        match = re.match(r'/url\?q=(http[^&]+)&', url)
+        if match:
+            url = urllib.unquote(match.group(1))
+        return title, url
+
+    def _extract_description(self, result):
+        desc_div = result.find('div', {'class': re.compile(r'\bs\b')})
+        if not desc_div:
+            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
+            return None
+
+        desc_strs = []
+        def looper(tag):
+            if not tag: return
+            for t in tag:
+                try:
+                    if t.name == 'br': break
+                except AttributeError:
+                    pass
+
+                try:
+                    desc_strs.append(t.string)
+                except AttributeError:
+                    desc_strs.append(t)
+
+        looper(desc_div)
+        looper(desc_div.find('wbr')) # BeautifulSoup does not self-close <wbr>
+
+        desc = ''.join(s for s in desc_strs if s)
+        return self._html_unescape(desc)
+
+    def _html_unescape(self, str):
+        def entity_replacer(m):
+            entity = m.group(1)
+            if entity in name2codepoint:
+                return unichr(name2codepoint[entity])
+            else:
+                return m.group(0)
+
+        def ascii_replacer(m):
+            cp = int(m.group(1))
+            if cp <= 255:
+                return unichr(cp)
+            else:
+                return m.group(0)
+
+        s =    re.sub(r'&#(\d+);',  ascii_replacer, str, re.U)
+        return re.sub(r'&([^;]+);', entity_replacer, s, re.U)
 
 class BlogSearch(GoogleSearch):
 

From 5327fc93e28cb625a624f1e996514897e0f65422 Mon Sep 17 00:00:00 2001
From: "nikola.milosevic" <nikola.milosevic@prelovac.com>
Date: Mon, 14 Jul 2014 13:11:02 +0100
Subject: [PATCH 2/9] Update of about files

---
 contributors.txt | 3 +++
 readme.txt       | 9 ++++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/contributors.txt b/contributors.txt
index 53e917d..debc2ae 100755
--- a/contributors.txt
+++ b/contributors.txt
@@ -1,5 +1,8 @@
 This file contains people who have helped xgoogle project:
 
+	 * Nikola Milosevic
+      Thanks for adding:
+		  * Face image search
     * Holger Berndt
       Thanks for adding:
        * 'lang' and 'tld' arguments to Google Search
diff --git a/readme.txt b/readme.txt
index 3835b39..96d80f2 100755
--- a/readme.txt
+++ b/readme.txt
@@ -1,7 +1,10 @@
-This is a Google library called 'xgoogle'. Current version is 1.3.
+This is a fork of a Google library called 'xgoogle'. Current version is 1.3.1
 
-It's written by Peteris Krumins (peter@catonmat.net).
-His blog is at http://www.catonmat.net  --  good coders code, great reuse.
+It is forked by Nikola Milosevic (nikola.milosevic@inspiratron.org) from the original code that was written by Peteris Krumins (peter@catonmat.net).
+Peteris Krumins blog is at http://www.catonmat.net  --  good coders code, great reuse.
+Nikola Milosevic's blog is at http://www.inspiratron.org.
+
+This fork adds Google face image search, and hopefully in the future Google image search
 
 The code is licensed under MIT license.
 

From 1d76d7320abcebc8daa6e28307c9ea5cad5c62ff Mon Sep 17 00:00:00 2001
From: "nikola.milosevic" <nikola.milosevic@prelovac.com>
Date: Mon, 14 Jul 2014 13:12:35 +0100
Subject: [PATCH 3/9] Readme update

---
 readme.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/readme.txt b/readme.txt
index 96d80f2..f33887c 100755
--- a/readme.txt
+++ b/readme.txt
@@ -190,13 +190,11 @@ v1.1:  * added Google Sponsored Links Search.
 v1.2:  * added Google Sets module
 v1.3:  * added Google Translate module
        * fixed a bug in browser.py when KeyboardInterrupt did not get propagated.
+v1.3:  * added Google Face image search
 
 --------------------------------------------------------------------------
 
 That's it. Have fun! :)
 
 
-Sincerely,
-Peteris Krumins
-http://www.catonmat.net
 

From d3996208a81fa2a7f0bfee2d08d8cac81f79f78d Mon Sep 17 00:00:00 2001
From: "nikola.milosevic" <nikola.milosevic@prelovac.com>
Date: Mon, 14 Jul 2014 13:31:47 +0100
Subject: [PATCH 4/9] Added normal image search

---
 examples/ImageExample2.py |  17 +++
 readme.txt                |   4 +-
 xgoogle/search.py         | 255 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 274 insertions(+), 2 deletions(-)
 create mode 100644 examples/ImageExample2.py

diff --git a/examples/ImageExample2.py b/examples/ImageExample2.py
new file mode 100644
index 0000000..afe37bb
--- /dev/null
+++ b/examples/ImageExample2.py
@@ -0,0 +1,17 @@
+#
+# This program does a Google search for images for "quick and dirty" and returns
+# 50 results.
+#
+
+from xgoogle.search import GoogleImageSearch, SearchError
+try:
+    gs = GoogleImageSearch("quick and dirty")
+    gs.results_per_page = 50
+    results = gs.get_results()
+    for res in results:
+        print res.trumb.encode('utf8')
+        print res.url.encode('utf8')
+        print
+except SearchError, e:
+    print "Search failed: %s" % e
+
diff --git a/readme.txt b/readme.txt
index f33887c..6408af5 100755
--- a/readme.txt
+++ b/readme.txt
@@ -1,4 +1,4 @@
-This is a fork of a Google library called 'xgoogle'. Current version is 1.3.1
+This is a fork of a Google library called 'xgoogle'. Current version is 1.4
 
 It is forked by Nikola Milosevic (nikola.milosevic@inspiratron.org) from the original code that was written by Peteris Krumins (peter@catonmat.net).
 Peteris Krumins blog is at http://www.catonmat.net  --  good coders code, great reuse.
@@ -190,7 +190,7 @@ v1.1:  * added Google Sponsored Links Search.
 v1.2:  * added Google Sets module
 v1.3:  * added Google Translate module
        * fixed a bug in browser.py when KeyboardInterrupt did not get propagated.
-v1.3:  * added Google Face image search
+v1.4:  * added Google image and face image search
 
 --------------------------------------------------------------------------
 
diff --git a/xgoogle/search.py b/xgoogle/search.py
index a4b381a..7934afa 100755
--- a/xgoogle/search.py
+++ b/xgoogle/search.py
@@ -308,6 +308,260 @@ def ascii_replacer(m):
 
         s =    re.sub(r'&#(\d+);',  ascii_replacer, str, re.U)
         return re.sub(r'&([^;]+);', entity_replacer, s, re.U)
+
+class GoogleImageSearch(object):
+    SEARCH_URL_0 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s"
+    NEXT_PAGE_0 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&start=%(start)d"
+    SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&num=%(num)d"
+    NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"
+
+    def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None):
+        self.query = query
+        self.debug = debug
+        self.browser = Browser(debug=debug)
+        self.results_info = None
+        self.eor = False # end of results
+        self._page = 0
+        self._first_indexed_in_previous = None
+        self._filetype = None
+        self._last_search_url = None
+        self._results_per_page = 10
+        self._last_from = 0
+        self._lang = lang
+        self._tld = tld
+        
+        if re_search_strings:
+            self._re_search_strings = re_search_strings
+        elif lang == "de":
+            self._re_search_strings = ("Ergebnisse", "von", u"ungefähr")
+        elif lang == "es":
+            self._re_search_strings = ("Resultados", "de", "aproximadamente")
+        # add more localised versions here
+        else:
+            self._re_search_strings = ("Results", "of", "about")
+
+        if random_agent:
+            self.browser.set_random_user_agent()
+
+    @property
+    def num_results(self):
+        if not self.results_info:
+            page = self._get_results_page()
+            self.results_info = self._extract_info(page)
+            if self.results_info['total'] == 0:
+                self.eor = True
+        return self.results_info['total']
+
+    @property
+    def last_search_url(self):
+        return self._last_search_url
+
+    def _get_page(self):
+        return self._page
+
+    def _set_page(self, page):
+        self._page = page
+
+    page = property(_get_page, _set_page)
+
+    def _get_first_indexed_in_previous(self):
+        return self._first_indexed_in_previous
+
+    def _set_first_indexed_in_previous(self, interval):
+        if interval == "day":
+            self._first_indexed_in_previous = 'd'
+        elif interval == "week":
+            self._first_indexed_in_previous = 'w'
+        elif interval == "month":
+            self._first_indexed_in_previous = 'm'
+        elif interval == "year":
+            self._first_indexed_in_previous = 'y'
+        else:
+            # a floating point value is a number of months
+            try:
+                num = float(interval)
+            except ValueError:
+                raise SearchError, "Wrong parameter to first_indexed_in_previous: %s" % (str(interval))
+            self._first_indexed_in_previous = 'm' + str(interval)
+    
+    first_indexed_in_previous = property(_get_first_indexed_in_previous, _set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months")
+    
+    def _get_filetype(self):
+        return self._filetype
+
+    def _set_filetype(self, filetype):
+        self._filetype = filetype
+    
+    filetype = property(_get_filetype, _set_filetype, doc="file extension to search for")
+    
+    def _get_results_per_page(self):
+        return self._results_per_page
+
+    def _set_results_par_page(self, rpp):
+        self._results_per_page = rpp
+
+    results_per_page = property(_get_results_per_page, _set_results_par_page)
+
+    def get_results(self):
+        """ Gets a page of results """
+        if self.eor:
+            return []
+        MAX_VALUE = 1000000
+        page = self._get_results_page()
+        results = self._extract_results(page)
+        search_info = {'from': self.results_per_page*self._page,
+                       'to': self.results_per_page*self._page + len(results),
+                       'total': MAX_VALUE}
+        if not self.results_info:
+            self.results_info = search_info
+            if self.num_results == 0:
+                self.eor = True
+                return []
+        if not results:
+            self.eor = True
+            return []
+        if self._page > 0 and search_info['from'] == self._last_from:
+            self.eor = True
+            return []
+        if search_info['to'] == search_info['total']:
+            self.eor = True
+        self._page += 1
+        self._last_from = search_info['from']
+        return results
+
+    def _maybe_raise(self, cls, *arg):
+        if self.debug:
+            raise cls(*arg)
+
+    def _get_results_page(self):
+        if self._page == 0:
+            if self._results_per_page == 10:
+                url = GoogleImageSearch.SEARCH_URL_0
+            else:
+                url = GoogleImageSearch.SEARCH_URL_1
+        else:
+            if self._results_per_page == 10:
+                url = GoogleImageSearch.NEXT_PAGE_0
+            else:
+                url = GoogleImageSearch.NEXT_PAGE_1
+
+        safe_url = [url % { 'query': urllib.quote_plus(self.query),
+                           'start': self._page * self._results_per_page,
+                           'num': self._results_per_page,
+                           'tld' : self._tld,
+                           'lang' : self._lang }]
+        
+        # possibly extend url with optional properties
+        if self._first_indexed_in_previous:
+            safe_url.extend(["&as_qdr=", self._first_indexed_in_previous])
+        if self._filetype:
+            safe_url.extend(["&as_filetype=", self._filetype])
+        
+        safe_url = "".join(safe_url)
+        self._last_search_url = safe_url
+        
+        try:
+            page = self.browser.get_page(safe_url)
+        except BrowserError, e:
+            raise SearchError, "Failed getting %s: %s" % (e.url, e.error)
+
+        return BeautifulSoup(page)
+
+    def _extract_info(self, soup):
+        empty_info = {'from': 0, 'to': 0, 'total': 0}
+        div_ssb = soup.find('div', id='ssb')
+        if not div_ssb:
+            self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
+            return empty_info
+        p = div_ssb.find('p')
+        if not p:
+            self._maybe_raise(ParseError, """<p> tag within <div id="ssb"> was not found on Google search page""", soup)
+            return empty_info
+        txt = ''.join(p.findAll(text=True))
+        txt = txt.replace(',', '')
+        matches = re.search(r'%s (\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U)
+        if not matches:
+            return empty_info
+        return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))}
+
+    def _extract_results(self, soup):
+        # Should extract <a href="/url?q=
+        results = soup.findAll('a', href=re.compile("^/url\?q="))
+        #results = soup.findAll('img')
+        ret_res = []
+        for result in results:
+            eres = self._extract_result(result)
+            if eres:
+                ret_res.append(eres)
+        return ret_res
+
+    def _extract_result(self, result):
+        imgsa = result.findAll('img')
+        if len(imgsa)==0:
+            return FaceImageSearchResult("","")
+        else:
+            imgs = imgsa[0]
+        trumnail = imgs['src']
+        image = result['href'][7:]
+        return FaceImageSearchResult(trumnail, image)
+
+    def _extract_title_url(self, result):
+        #title_a = result.find('a', {'class': re.compile(r'\bl\b')})
+        title_a = result.find('a')
+        if not title_a:
+            self._maybe_raise(ParseError, "Title tag in Google search result was not found", result)
+            return None, None
+        title = ''.join(title_a.findAll(text=True))
+        title = self._html_unescape(title)
+        url = title_a['href']
+        match = re.match(r'/url\?q=(http[^&]+)&', url)
+        if match:
+            url = urllib.unquote(match.group(1))
+        return title, url
+
+    def _extract_description(self, result):
+        desc_div = result.find('div', {'class': re.compile(r'\bs\b')})
+        if not desc_div:
+            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
+            return None
+
+        desc_strs = []
+        def looper(tag):
+            if not tag: return
+            for t in tag:
+                try:
+                    if t.name == 'br': break
+                except AttributeError:
+                    pass
+
+                try:
+                    desc_strs.append(t.string)
+                except AttributeError:
+                    desc_strs.append(t)
+
+        looper(desc_div)
+        looper(desc_div.find('wbr')) # BeautifulSoup does not self-close <wbr>
+
+        desc = ''.join(s for s in desc_strs if s)
+        return self._html_unescape(desc)
+
+    def _html_unescape(self, str):
+        def entity_replacer(m):
+            entity = m.group(1)
+            if entity in name2codepoint:
+                return unichr(name2codepoint[entity])
+            else:
+                return m.group(0)
+
+        def ascii_replacer(m):
+            cp = int(m.group(1))
+            if cp <= 255:
+                return unichr(cp)
+            else:
+                return m.group(0)
+
+        s =    re.sub(r'&#(\d+);',  ascii_replacer, str, re.U)
+        return re.sub(r'&([^;]+);', entity_replacer, s, re.U)
     
 
 class GoogleFaceImageSearch(object):
@@ -641,4 +895,5 @@ def looper(tag):
 
         desc = ''.join(s for s in desc_strs if s)
         return self._html_unescape(desc)
+
         

From 7a8e147171c9710f4d80dba72ee5181c0a0a0203 Mon Sep 17 00:00:00 2001
From: "nikola.milosevic" <nikola.milosevic@prelovac.com>
Date: Tue, 15 Jul 2014 12:22:14 +0100
Subject: [PATCH 5/9] Added Google video search. It requires NLTK

---
 examples/exampleVideoSearch.py |  22 +++
 readme.txt                     |   1 +
 xgoogle/search.py              | 294 ++++++++++++++++++++++++++++++++-
 3 files changed, 316 insertions(+), 1 deletion(-)
 create mode 100755 examples/exampleVideoSearch.py

diff --git a/examples/exampleVideoSearch.py b/examples/exampleVideoSearch.py
new file mode 100755
index 0000000..807585c
--- /dev/null
+++ b/examples/exampleVideoSearch.py
@@ -0,0 +1,22 @@
+#
+# This program does a Google search for video for "Iron Maiden" and returns
+# 50 results. Video search requires NLTK. For instruction on installation please visit http://www.nltk.org/install.html
+#
+
+from xgoogle.search import GoogleVideoSearch, SearchError
+try:
+    gs = GoogleVideoSearch("Iron Maiden")
+    gs.results_per_page = 50
+    results = gs.get_results()
+    for res in results:
+        print 'Name: ' + res.name.encode('utf8')
+        print 'URL: ' + res.url.encode('utf8')
+        print 'Date: ' + res.date.encode('utf8')
+        print 'Duration: ' + res.duration.encode('utf8')
+        print 'Author: ' + res.author.encode('utf8')
+        print 'Description: ' + res.description.encode('utf8')
+        
+        print
+except SearchError, e:
+    print "Search failed: %s" % e
+
diff --git a/readme.txt b/readme.txt
index 6408af5..2d4e929 100755
--- a/readme.txt
+++ b/readme.txt
@@ -191,6 +191,7 @@ v1.2:  * added Google Sets module
 v1.3:  * added Google Translate module
        * fixed a bug in browser.py when KeyboardInterrupt did not get propagated.
 v1.4:  * added Google image and face image search
+	   * added Google video search (requires NLTK, for install instruction see http://www.nltk.org/install.html)
 
 --------------------------------------------------------------------------
 
diff --git a/xgoogle/search.py b/xgoogle/search.py
index 7934afa..41129cf 100755
--- a/xgoogle/search.py
+++ b/xgoogle/search.py
@@ -15,6 +15,7 @@
 import urllib
 from htmlentitydefs import name2codepoint
 from BeautifulSoup import BeautifulSoup
+import nltk
 
 from browser import Browser, BrowserError
 
@@ -42,13 +43,34 @@ def __str__(self):
     def html(self):
         return self.tag.prettify()
     
+    
+#     videoname =  nltk.clean_html(str(h3[0]))
+#         video_url = result.findAll('cite')
+#         date_and_author = result.find('div',{'class':'f slp'})
+#         da = str.split(str(date_and_author,' - Uploaded by'))
+#         date = da[0]
+#         author = da[1]
+#         desc = result.find('span',{'class':'st'})
+#         description = nltk.clean_html(str(desc))
+class FaceVideoSearchResult:
+    def __init__(self, name, url, description,date,duration,author):
+        self.name = name
+        self.url = url
+        self.description = description
+        self.date = date
+        self.duration = duration
+        self.author= author
+
+    def __str__(self):
+        return 'Google Search Result: "%s"' % self.name
+
 class FaceImageSearchResult:
     def __init__(self, trumb, url):
         self.url = url
         self.trumb = trumb
 
     def __str__(self):
-        return 'Google Search Result: "%s"' % self.title
+        return 'Google Search Result: "%s"' % self.trumb
 
 class SearchResult:
     def __init__(self, title, url, desc):
@@ -308,6 +330,276 @@ def ascii_replacer(m):
 
         s =    re.sub(r'&#(\d+);',  ascii_replacer, str, re.U)
         return re.sub(r'&([^;]+);', entity_replacer, s, re.U)
+    
+class GoogleVideoSearch(object):
+    SEARCH_URL_0 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s"
+    NEXT_PAGE_0 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&start=%(start)d"
+    SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&num=%(num)d"
+    NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"
+
+    def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None):
+        self.query = query
+        self.debug = debug
+        self.browser = Browser(debug=debug)
+        self.results_info = None
+        self.eor = False # end of results
+        self._page = 0
+        self._first_indexed_in_previous = None
+        self._filetype = None
+        self._last_search_url = None
+        self._results_per_page = 10
+        self._last_from = 0
+        self._lang = lang
+        self._tld = tld
+        
+        if re_search_strings:
+            self._re_search_strings = re_search_strings
+        elif lang == "de":
+            self._re_search_strings = ("Ergebnisse", "von", u"ungefähr")
+        elif lang == "es":
+            self._re_search_strings = ("Resultados", "de", "aproximadamente")
+        # add more localised versions here
+        else:
+            self._re_search_strings = ("Results", "of", "about")
+
+        if random_agent:
+            self.browser.set_random_user_agent()
+
+    @property
+    def num_results(self):
+        if not self.results_info:
+            page = self._get_results_page()
+            self.results_info = self._extract_info(page)
+            if self.results_info['total'] == 0:
+                self.eor = True
+        return self.results_info['total']
+
+    @property
+    def last_search_url(self):
+        return self._last_search_url
+
+    def _get_page(self):
+        return self._page
+
+    def _set_page(self, page):
+        self._page = page
+
+    page = property(_get_page, _set_page)
+
+    def _get_first_indexed_in_previous(self):
+        return self._first_indexed_in_previous
+
+    def _set_first_indexed_in_previous(self, interval):
+        if interval == "day":
+            self._first_indexed_in_previous = 'd'
+        elif interval == "week":
+            self._first_indexed_in_previous = 'w'
+        elif interval == "month":
+            self._first_indexed_in_previous = 'm'
+        elif interval == "year":
+            self._first_indexed_in_previous = 'y'
+        else:
+            # a floating point value is a number of months
+            try:
+                num = float(interval)
+            except ValueError:
+                raise SearchError, "Wrong parameter to first_indexed_in_previous: %s" % (str(interval))
+            self._first_indexed_in_previous = 'm' + str(interval)
+    
+    first_indexed_in_previous = property(_get_first_indexed_in_previous, _set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months")
+    
+    def _get_filetype(self):
+        return self._filetype
+
+    def _set_filetype(self, filetype):
+        self._filetype = filetype
+    
+    filetype = property(_get_filetype, _set_filetype, doc="file extension to search for")
+    
+    def _get_results_per_page(self):
+        return self._results_per_page
+
+    def _set_results_par_page(self, rpp):
+        self._results_per_page = rpp
+
+    results_per_page = property(_get_results_per_page, _set_results_par_page)
+
+    def get_results(self):
+        """ Gets a page of results """
+        if self.eor:
+            return []
+        MAX_VALUE = 1000000
+        page = self._get_results_page()
+        results = self._extract_results(page)
+        search_info = {'from': self.results_per_page*self._page,
+                       'to': self.results_per_page*self._page + len(results),
+                       'total': MAX_VALUE}
+        if not self.results_info:
+            self.results_info = search_info
+            if self.num_results == 0:
+                self.eor = True
+                return []
+        if not results:
+            self.eor = True
+            return []
+        if self._page > 0 and search_info['from'] == self._last_from:
+            self.eor = True
+            return []
+        if search_info['to'] == search_info['total']:
+            self.eor = True
+        self._page += 1
+        self._last_from = search_info['from']
+        return results
+
+    def _maybe_raise(self, cls, *arg):
+        if self.debug:
+            raise cls(*arg)
+
+    def _get_results_page(self):
+        if self._page == 0:
+            if self._results_per_page == 10:
+                url = GoogleVideoSearch.SEARCH_URL_0
+            else:
+                url = GoogleVideoSearch.SEARCH_URL_1
+        else:
+            if self._results_per_page == 10:
+                url = GoogleVideoSearch.NEXT_PAGE_0
+            else:
+                url = GoogleVideoSearch.NEXT_PAGE_1
+
+        safe_url = [url % { 'query': urllib.quote_plus(self.query),
+                           'start': self._page * self._results_per_page,
+                           'num': self._results_per_page,
+                           'tld' : self._tld,
+                           'lang' : self._lang }]
+        
+        # possibly extend url with optional properties
+        if self._first_indexed_in_previous:
+            safe_url.extend(["&as_qdr=", self._first_indexed_in_previous])
+        if self._filetype:
+            safe_url.extend(["&as_filetype=", self._filetype])
+        
+        safe_url = "".join(safe_url)
+        self._last_search_url = safe_url
+        
+        try:
+            page = self.browser.get_page(safe_url)
+        except BrowserError, e:
+            raise SearchError, "Failed getting %s: %s" % (e.url, e.error)
+
+        return BeautifulSoup(page)
+
+    def _extract_info(self, soup):
+        empty_info = {'from': 0, 'to': 0, 'total': 0}
+        div_ssb = soup.find('div', id='ssb')
+        if not div_ssb:
+            self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
+            return empty_info
+        p = div_ssb.find('p')
+        if not p:
+            self._maybe_raise(ParseError, """<p> tag within <div id="ssb"> was not found on Google search page""", soup)
+            return empty_info
+        txt = ''.join(p.findAll(text=True))
+        txt = txt.replace(',', '')
+        matches = re.search(r'%s (\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U)
+        if not matches:
+            return empty_info
+        return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))}
+
+    def _extract_results(self, soup):
+        # Should extract <a href="/url?q=
+        results = soup.findAll('li', {"class" : re.compile(r'\b(g videobox|g)\b')})
+        #results = soup.findAll('img')
+        ret_res = []
+        for result in results:
+            eres = self._extract_result(result)
+            if eres:
+                ret_res.append(eres)
+        return ret_res
+
+    def _extract_result(self, result):
+        """Turn one result tag into a FaceVideoSearchResult.
+        Returns None when the tag carries no linked <h3> heading."""
+        h3 = result.findAll('h3')
+        videoname = nltk.clean_html(''.join(str(tag) for tag in h3))
+        anchors = BeautifulSoup(str(h3)).findAll('a')
+        if not anchors:
+            return None  # nothing to link to; _extract_results skips falsy results
+        href = anchors[0]['href'].encode('utf8')
+        # Unwrap "/url?q=<escaped target>&..." redirects; keep any other href untouched.
+        match = re.match(r'/url\?q=([^&]+)', href)
+        url = urllib.unquote(match.group(1)) if match else href
+        desc = result.find('span', {'class': 'st'})
+        meta = result.find('span', {'class': 'f'})
+        date = duration = author = ''
+        if meta is not None:
+            # Split on '-' into date / duration / uploader; pad so short metadata cannot raise.
+            metaarr = nltk.clean_html(str(meta)).split('-') + ['', '']
+            date = metaarr[0]
+            duration = metaarr[1]
+            author = metaarr[2][13:]  # skips a 13-char prefix, presumably " Uploaded by "
+        description = ''
+        if desc is not None:
+            description = nltk.clean_html(str(desc))
+        return FaceVideoSearchResult(videoname, url, description, date, duration, author)
+
+    def _extract_title_url(self, result):
+        #title_a = result.find('a', {'class': re.compile(r'\bl\b')})
+        title_a = result.find('a')
+        if not title_a:
+            self._maybe_raise(ParseError, "Title tag in Google search result was not found", result)
+            return None, None
+        title = ''.join(title_a.findAll(text=True))
+        title = self._html_unescape(title)
+        url = title_a['href']
+        match = re.match(r'/url\?q=(http[^&]+)&', url)
+        if match:
+            url = urllib.unquote(match.group(1))
+        return title, url
+
+    def _extract_description(self, result):
+        desc_div = result.find('div', {'class': re.compile(r'\bs\b')})
+        if not desc_div:
+            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
+            return None
+
+        desc_strs = []
+        def looper(tag):
+            if not tag: return
+            for t in tag:
+                try:
+                    if t.name == 'br': break
+                except AttributeError:
+                    pass
+
+                try:
+                    desc_strs.append(t.string)
+                except AttributeError:
+                    desc_strs.append(t)
+
+        looper(desc_div)
+        looper(desc_div.find('wbr')) # BeautifulSoup does not self-close <wbr>
+
+        desc = ''.join(s for s in desc_strs if s)
+        return self._html_unescape(desc)
+
+    def _html_unescape(self, str):
+        def entity_replacer(m):
+            entity = m.group(1)
+            if entity in name2codepoint:
+                return unichr(name2codepoint[entity])
+            else:
+                return m.group(0)
+
+        def ascii_replacer(m):
+            cp = int(m.group(1))
+            if cp <= 255:
+                return unichr(cp)
+            else:
+                return m.group(0)
+
+        s =    re.sub(r'&#(\d+);',  ascii_replacer, str, re.U)
+        return re.sub(r'&([^;]+);', entity_replacer, s, re.U)
 
 class GoogleImageSearch(object):
     SEARCH_URL_0 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s"

From 6c0fa59bded7d45240bc18cb5e8447a255020098 Mon Sep 17 00:00:00 2001
From: "nikola.milosevic86" <nikola.milosevic86@gmail.com>
Date: Tue, 15 Jul 2014 12:27:38 +0100
Subject: [PATCH 6/9] Comments deleted

---
 xgoogle/search.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/xgoogle/search.py b/xgoogle/search.py
index 41129cf..5252032 100755
--- a/xgoogle/search.py
+++ b/xgoogle/search.py
@@ -507,9 +507,7 @@ def _extract_info(self, soup):
         return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))}
 
     def _extract_results(self, soup):
-        # Should extract <a href="/url?q=
         results = soup.findAll('li', {"class" : re.compile(r'\b(g videobox|g)\b')})
-        #results = soup.findAll('img')
         ret_res = []
         for result in results:
             eres = self._extract_result(result)

From 9225fe9ecda49644b050ebc2804fa6e74aeb8c9d Mon Sep 17 00:00:00 2001
From: "nikola.milosevic86" <nikola.milosevic86@gmail.com>
Date: Tue, 15 Jul 2014 12:37:18 +0100
Subject: [PATCH 7/9] Setup file updated

---
 setup.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 1027405..6f79f79 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 import sys
 
-__version__ = '1.3'
+__version__ = '1.4'
 
 import os
 def _read(fname):
@@ -14,8 +14,8 @@ def _read(fname):
     long_description=_read('readme.txt'),
     classifiers=[],
     keywords='google search',
-    author='Peteris Krumins',
-    author_email='peter@catonmat.net',
+    author='Peteris Krumins, Nikola Milosevic',
+    author_email='nikola.milosevic@inspiratron.org',
     url='http://github.com/pkrumins/xgoogle',
     license='MIT',
     packages=find_packages(exclude=['ez_setup', 'examples', 'tests']),
@@ -25,6 +25,7 @@ def _read(fname):
     include_package_data=True,
     zip_safe=False,
     install_requires=[
+                       'nltk==2.0.4'
         # -*- Extra requirements: -*-
     ],
 )

From 8e60e3cc9c185ce8bc743b613eec6b785aed1d30 Mon Sep 17 00:00:00 2001
From: "nikola.milosevic86" <nikola.milosevic86@gmail.com>
Date: Thu, 17 Jul 2014 16:42:24 +0100
Subject: [PATCH 8/9] Change on one example

---
 examples/ImageExample.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/ImageExample.py b/examples/ImageExample.py
index d625649..3093f6e 100644
--- a/examples/ImageExample.py
+++ b/examples/ImageExample.py
@@ -5,7 +5,7 @@
 
 from xgoogle.search import GoogleFaceImageSearch, SearchError
 try:
-    gs = GoogleFaceImageSearch("quick and dirty")
+    gs = GoogleFaceImageSearch("Eddard Stark")
     gs.results_per_page = 50
     results = gs.get_results()
     for res in results:

From 13b8d2540e32f03909b232dae686a6ffc397615a Mon Sep 17 00:00:00 2001
From: "nikola.milosevic86" <nikola.milosevic86@gmail.com>
Date: Wed, 10 Sep 2014 19:28:09 +0100
Subject: [PATCH 9/9] Added excerpt to the code of google search. Excerpt is
 the best match found in the searched web page. This is done by Michele
 Filannino and myself

---
 examples/example1.py           |  3 ++-
 examples/exampleVideoSearch.py |  3 +--
 xgoogle/search.py              | 34 +++++++++++++++++++++++++++++++---
 3 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/examples/example1.py b/examples/example1.py
index 79e14cb..5121109 100755
--- a/examples/example1.py
+++ b/examples/example1.py
@@ -6,12 +6,13 @@
 
 from xgoogle.search import GoogleSearch, SearchError
 try:
-  gs = GoogleSearch("quick and dirty")
+  gs = GoogleSearch("game of thrones season 3")
   gs.results_per_page = 50
   results = gs.get_results()
   for res in results:
     print res.title.encode('utf8')
     print res.desc.encode('utf8')
+    print res.excerpt.encode('utf8')
     print res.url.encode('utf8')
     print
 except SearchError, e:
diff --git a/examples/exampleVideoSearch.py b/examples/exampleVideoSearch.py
index 807585c..98910b3 100755
--- a/examples/exampleVideoSearch.py
+++ b/examples/exampleVideoSearch.py
@@ -14,8 +14,7 @@
         print 'Date: ' + res.date.encode('utf8')
         print 'Duration: ' + res.duration.encode('utf8')
         print 'Author: ' + res.author.encode('utf8')
-        print 'Description: ' + res.description.encode('utf8')
-        
+        print 'Description: ' + res.description.encode('utf8')        
         print
 except SearchError, e:
     print "Search failed: %s" % e
diff --git a/xgoogle/search.py b/xgoogle/search.py
index 5252032..4c3283d 100755
--- a/xgoogle/search.py
+++ b/xgoogle/search.py
@@ -73,10 +73,11 @@ def __str__(self):
         return 'Google Search Result: "%s"' % self.trumb
 
 class SearchResult:
-    def __init__(self, title, url, desc):
+    def __init__(self, title, url, desc,excerpt):
         self.title = title
         self.url = url
         self.desc = desc
+        self.excerpt = excerpt
 
     def __str__(self):
         return 'Google Search Result: "%s"' % self.title
@@ -269,9 +270,12 @@ def _extract_results(self, soup):
     def _extract_result(self, result):
         title, url = self._extract_title_url(result)
         desc = self._extract_description(result)
-        if not title or not url or not desc:
+        if desc == None:
+            desc = ''
+        excerpt = self._extract_excerpt(result)
+        if not title or not url or not (desc or excerpt):
             return None
-        return SearchResult(title, url, desc)
+        return SearchResult(title, url, desc,excerpt)
 
     def _extract_title_url(self, result):
         #title_a = result.find('a', {'class': re.compile(r'\bl\b')})
@@ -286,6 +290,30 @@ def _extract_title_url(self, result):
         if match:
             url = urllib.unquote(match.group(1))
         return title, url
+    
+    def _extract_excerpt(self, result):
+        """Return the text of the result's <span class="st"> element, or None if it is missing."""
+        def looper(tag):
+            if not tag: return
+            for t in tag:
+                # Collect text from every child, tags included; children
+                # without a .string attribute are appended as they are.
+                try:
+                    desc_strs.append(t.string)
+                except AttributeError:
+                    desc_strs.append(t)
+        desc_div = result.find('span', {'class': re.compile(r'\bst\b')})
+        if not desc_div:
+            self._maybe_raise(ParseError, "Content excerpt tag in Google search result was not found", result)
+            return None
+        desc_strs = []
+
+        looper(desc_div)
+        looper(desc_div.find('wbr')) # BeautifulSoup does not self-close <wbr>
+
+        # Falsy entries (None, empty strings) are dropped before joining.
+        desc = ''.join(s for s in desc_strs if s)
+        return self._html_unescape(desc)
 
     def _extract_description(self, result):
         desc_div = result.find('div', {'class': re.compile(r'\bs\b')})