diff --git a/.gitignore b/.gitignore
index c5c62a8..c8f6f70 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,9 @@
 # vendor/
 coverage.txt
 
+# ignore binaries
+stashbox
+stashbox.exe
+
+# ignore default archive location
 stashDb/
diff --git a/cmd/stashbox/main.go b/cmd/stashbox/main.go
index 4932f56..e722b0d 100644
--- a/cmd/stashbox/main.go
+++ b/cmd/stashbox/main.go
@@ -63,7 +63,7 @@ func main() {
 		panic(err)
 	}
 
-	err = c.AddUrl(*url)
+	err = c.AddURL(*url)
 	if err != nil {
 		panic(err)
 	}
diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
index 44ad35f..be12003 100644
--- a/pkg/crawler/crawler.go
+++ b/pkg/crawler/crawler.go
@@ -23,9 +23,9 @@ import (
 
 // Site ...
 type Site struct {
-	HtmlBody []byte
+	HTMLBody []byte
 	TextBody []byte
-	Url      string
+	URL      string
 	Title    string
 }
 
@@ -38,7 +38,7 @@ type Crawler struct {
 
 // Error Messages
 var (
-	errNoTitleInHtml = errors.New("No title tag in HTML response")
+	errNoTitleInHTML = errors.New("No title tag in HTML response")
 	// regular expression from: https://mathiasbynens.be/demo/url-regex,
 	// by @imme_emosol
 	urlRegExp, _ = regexp.Compile(`^(https|http)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$`)
@@ -57,7 +57,7 @@ func (c *Crawler) Save() error {
 
 	// save all sites one by one
 	for _, s := range c.Sites {
-		fmt.Printf("Saving %s...\n", s.Url)
+		fmt.Printf("Saving %s...\n", s.URL)
 		if err := c.saveSite(s); err != nil {
 			return err
 		}
@@ -68,7 +68,7 @@ func (c *Crawler) Save() error {
 
 func (c *Crawler) saveSite(s Site) error {
 	dateTime := dateTimeFileName()
-	domainSubPath, err := buildPath(c.Archive, s.Url)
+	domainSubPath, err := buildPath(c.Archive, s.URL)
 	if err != nil {
 		return err
 	}
@@ -81,7 +81,7 @@ func (c *Crawler) saveSite(s Site) error {
 	// save the html
 	htmlFileName := fmt.Sprintf("%s.html", dateTime)
 	htmlSavePath := path.Join(domainSubPath, htmlFileName)
-	err = ioutil.WriteFile(htmlSavePath, s.HtmlBody, 0600)
+	err = ioutil.WriteFile(htmlSavePath, s.HTMLBody, 0600)
 	if err != nil {
 		return err
 	}
@@ -97,7 +97,7 @@ func (c *Crawler) saveSite(s Site) error {
 	// save the pdf
 	pdfFileName := fmt.Sprintf("%s.pdf", dateTime)
 	pdfSavePath := path.Join(domainSubPath, pdfFileName)
-	if err := generatePDF(pdfSavePath, s.Url); err != nil {
+	if err := generatePDF(pdfSavePath, s.URL); err != nil {
 		return err
 	}
 	return nil
@@ -128,8 +128,8 @@ func dateTimeFileName() string {
 	return t.Format(timestamp)
 }
 
-// AddUrl ...
-func (c *Crawler) AddUrl(url string) error {
+// AddURL adds the url to the crawler's list of urls
+func (c *Crawler) AddURL(url string) error {
 	url = strings.TrimSpace(url)
 	if len(url) == 0 {
 		return errors.New("URL can't be empty or only containing space")
@@ -155,10 +155,10 @@ func createSiteFilename(url string, htmlBody []byte) (string, error) {
 	forbiddenCharactersWindows := [...]rune{'/', '<', '>', ':', '"', '\\', '|', '?', '*'}
 	reservedFilenamesWindows := [...]string{"CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9"}
 
-	title, err := getHtmlTitle(htmlBody)
+	title, err := getHTMLTitle(htmlBody)
 
 	// if there is no title, do old way of creating hash
-	if err == errNoTitleInHtml {
+	if err == errNoTitleInHTML {
 		h := sha256.New()
 		_, err = io.WriteString(h, url)
 		if err != nil {
@@ -194,7 +194,7 @@ func (c *Crawler) Crawl() error {
 
 		var site Site
 
-		htmlBody, err := getHtmlBody(u)
+		htmlBody, err := getHTMLBody(u)
 		if err != nil {
 			return err
 		}
@@ -212,7 +212,7 @@ func (c *Crawler) Crawl() error {
 		}
 
 		site.TextBody = textBody
-		site.Url = u
+		site.URL = u
 
 		c.Sites = append(c.Sites, site)
 	}
@@ -220,7 +220,7 @@ func (c *Crawler) Crawl() error {
 	return nil
 }
 
-func getHtmlTitle(body []byte) (title string, err error) {
+func getHTMLTitle(body []byte) (title string, err error) {
 	// HTML DOM Document
 	r := bytes.NewReader(body)
 
@@ -232,13 +232,13 @@ func getHtmlTitle(body []byte) (title string, err error) {
 	titleTag := doc.Find("title").First()
 
 	if titleTag.Size() == 0 {
-		return "", errNoTitleInHtml
+		return "", errNoTitleInHTML
 	}
 
 	return titleTag.Text(), nil
 }
 
-func getHtmlBody(url string) (body []byte, err error) {
+func getHTMLBody(url string) (body []byte, err error) {
 	// #nosec - gosec will detect this as a G107 error
 	// the point of this function *is* to accept a variable URL
 	resp, err := http.Get(url)
diff --git a/pkg/crawler/crawler_test.go b/pkg/crawler/crawler_test.go
index 5eea8a5..90258da 100644
--- a/pkg/crawler/crawler_test.go
+++ b/pkg/crawler/crawler_test.go
@@ -10,9 +10,9 @@ func TestGetHtmlTitle(t *testing.T) {
 	const url = "https://github.com/zpeters/stashbox"
 	const want = "GitHub - zpeters/stashbox: Your personal Internet Archive"
 
-	body, err := getHtmlBody(url)
+	body, err := getHTMLBody(url)
 	handleErr(t, err)
-	title, err := getHtmlTitle(body)
+	title, err := getHTMLTitle(body)
 	handleErr(t, err)
 	if title != want {
 		t.Errorf("Wrong title found. Want: %s, Got : %s", want, title)
@@ -28,7 +28,7 @@ func TestAddUrl(t *testing.T) {
 
 	for i := 1; i <= count; i++ {
 		url := "https://www.github.com" + strconv.Itoa(i)
-		err = c.AddUrl(url)
+		err = c.AddURL(url)
 		if err != nil {
 			t.Errorf("Test case for url: '" + url + "' failed; it should pass; error:" + err.Error())
 		}
@@ -65,7 +65,7 @@ func TestCrawl(t *testing.T) {
 	}
 
 	for _, s := range crawlSites {
-		err = c.AddUrl(s)
+		err = c.AddURL(s)
 		handleErr(t, err)
 	}
 	err = c.Crawl()
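Note for reviewers: the renames follow Go's initialism convention (identifiers keep `URL` and `HTML` fully capitalized, per Go Code Review Comments), so the change is mechanical. For context, here is a minimal sketch of how the renamed exported surface reads end to end. It is illustrative only: the struct-literal construction and the "stashDb" archive path are assumptions, since the diff shows the exported `Archive` and `Sites` fields and the `AddURL`/`Crawl`/`Save` methods but not the constructor that `cmd/stashbox` actually uses.

```go
package main

import (
	"fmt"

	"github.com/zpeters/stashbox/pkg/crawler"
)

func main() {
	// Assumed construction via exported fields; the real entry point in
	// cmd/stashbox/main.go may use a dedicated constructor instead.
	c := &crawler.Crawler{Archive: "stashDb"}

	// AddURL (renamed from AddUrl) validates and queues a url.
	if err := c.AddURL("https://github.com/zpeters/stashbox"); err != nil {
		panic(err)
	}

	// Crawl fetches each queued url; Save writes the HTML, text, and PDF
	// snapshots under the archive directory.
	if err := c.Crawl(); err != nil {
		panic(err)
	}
	if err := c.Save(); err != nil {
		panic(err)
	}

	// Each Site now exposes URL and HTMLBody (renamed from Url/HtmlBody).
	for _, s := range c.Sites {
		fmt.Println(s.URL, s.Title)
	}
}
```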