// Homework 7: Web Scraping // Due March 28, 2017 at 11:59pm package main import ( "fmt" "log" "github.com/PuerkitoBio/goquery" ) func main() { doc, err := goquery.NewDocument("http://uszip.com/zip/19104") if err != nil { log.Fatal(err) } city := doc.Find("div.zip-data hgroup h2 strong").Eq(0).Text() fmt.Println(city) } // News is a Hacker News article listing type News struct { Points int Title string Username string URL string } // NewsSlice is a slice of News pointers type NewsSlice []*News // ScrapeHackerNews scrapes the website "https://news.ycombinator.com/" using goquery and returns // information on the first n posts. // // For each post, the attributes to be extracted are: points, title, username and url. // This data should be returned as a NewsSlice, where NewsSlice is a custom slice of News structs. // // For example, for the sample image located at `https://cis193.com/homeworks/hn.png`, // the struct would look like: // News{24, "QEMU(TCG): user-to-root privesc inside VM via bad translation caching", // "webaholic", "https://bugs.chromium.org/p/project-zero/issues/detail?id=1122"}. // // If n is greater than the number of total posts available (which should be 30), return data from // the all of the available posts (all thirty). func ScrapeHackerNews(n int) NewsSlice { // TODO return nil } // GetEmails returns a string slice of the emails found on the given URL. // // Scenario: you are a student enthusiastic about spreading awareness about Go. To effectively // market Go, you decide to email Penn CIS professors about the wonders of the Go programming // language. In this function, use goquery to extract the email addresses from the URL // "http://www.cis.upenn.edu/about-people/" and return them as a string slice. This will involve you // having to investigate where and how emails are located on the webpage. // Note: you should have 47 total emails returned. func GetEmails() []string { // TODO return nil } // CountryData has GDP information on a country type CountryData struct { Country string GDP string } // GetCountryGDP takes in a string country name and returns the GDP (in millions) as // an integer. Information on the country is found by concurrently scraping a hidden website with // data on countries scattered on many pages. // // Scenario: imagine you are a spy and you have discovered a URL with top secret GDP information: // "https://www.cis.upenn.edu/~cis193/scraping/9828772efc2bd314a277c8880695dea2.html". This webpage // has a country name and the GDP (in millions of US Dollars). It also has links to two other // country's webpages. Based on intelligence you've received, every country has a webpage on this // website with information about it, but you do not know the URL for each page. You can assume that // none of the page links lead you to a cycle and every country can be reached from a path from the // initial URL that you are given. So, for this function, you will need to traverse from the initial // url to every webpage link you encounter in order to find information on the target `country` // string. Since time is of the essence, you want to use concurrency to scrape webpages // simultaneously. Note that for this function, we only care about getting the GDP for the input // `country` string. You may find it useful to use the CountryData struct to send country // information between goroutines. // // To prevent the function from getting stuck if an invalid `country` string is entered, // you should also implement a timeout that will automatically return an error after 10 seconds // if the program hasn't already finished terminating. // // Feel free to make and use helper functions for this function. To help with testing this // function, we know from intelligence reports that the GDP for "Canada" is 1532343 and // the GDP for "Colombia" is 274135. func GetCountryGDP(country string) (int, error) { // TODO return 0, nil }