package org.jsoup.integration;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import org.junit.Ignore;
import java.io.*;
import java.net.URISyntaxException;
import static org.junit.Assert.*;
/**
* Integration test: parses from real-world example HTML.
*
* @author Jonathan Hedley, jonathan@hedley.net
*/
public class ParseTest {
/**
 * Parses the stored Wikipedia article fixture and checks the document title, the
 * {@code lang} attribute of the root element, a few element counts, relative versus
 * absolute link resolution, non-ASCII link text, and a {@code :contains} text match.
 *
 * @throws IOException propagated from loading or parsing the fixture file
 */
@Test
public void testWikipediaArticle() throws IOException {
    File in = getFile("/htmltests/wikipedia-article-1.html");
    Document doc = Jsoup.parse(in, "UTF-8",
        "http://en.wikipedia.org/wiki/Debian");

    // The apostrophe in the source title is a literal ’ (8217), not escaped or '.
    assertEquals("Kepler’s laws of planetary motion - Wikipedia, the free encyclopedia",
        doc.title());
    assertEquals("en", doc.select("html").attr("lang"));

    Elements thumbInner = doc.select(".thumbinner > *");
    assertEquals(10, thumbInner.size());
    assertEquals("Kepler's laws of planetary motion", doc.select(".mw-body h1").text().trim());

    // A site-relative href must be reported verbatim and resolved against the base URI via abs:href.
    Element a = doc.select("a[href=/wiki/File:Kepler_laws_diagram.svg]").first();
    assertEquals("/wiki/File:Kepler_laws_diagram.svg", a.attr("href"));
    assertEquals("http://en.wikipedia.org/wiki/File:Kepler_laws_diagram.svg", a.attr("abs:href"));

    // An href that is already absolute must come back unchanged from abs:href.
    Element hs = doc.select("a[href*=stargaze]").first();
    assertEquals(
        "http://www-istp.gsfc.nasa.gov/stargaze/Skeplaws.htm",
        hs.attr("href"));
    assertEquals(hs.attr("href"), hs.attr("abs:href"));

    Elements results = doc.select("span.reference-text > a");
    assertEquals(15, results.size());
    assertEquals("http://demonstrations.wolfram.com/KeplersSecondLaw/",
        results.get(0).attr("href"));
    assertEquals("http://plato.stanford.edu/archives/win2008/entries/newton-principia/",
        results.get(1).attr("href"));

    // Link text outside the ASCII range must survive the parse intact.
    a = doc.select("a[href=//ja.wikipedia.org/wiki/%E3%82%B1%E3%83%97%E3%83%A9%E3%83%BC%E3%81%AE%E6%B3%95%E5%89%87]").first();
    assertEquals("日本語", a.text());

    // The :contains(...) argument is closed explicitly; the query previously ended without
    // its ')' and so depended on the selector parser tolerating an unbalanced argument.
    Element p = doc.select("p:contains(different parts in its orbit)").first();
    assertEquals("Now as the first law states that the planet follows an ellipse, the planet is at different distances from the Sun at different parts in its orbit. So the planet has to move faster when it is closer to the Sun so that it sweeps equal areas in equal times.", p.text());
}
/**
 * Parses the stored SMH business article fixture and checks its title, the
 * {@code xml:lang} attribute on the root element, and the number of direct children
 * under the article body. Disabled through {@code @Ignore}.
 *
 * @throws IOException propagated from loading or parsing the fixture file
 */
@Test
@Ignore
public void testSmhBizArticle() throws IOException {
    File fixture = getFile("/htmltests/smh-biz-article-1.html");
    String baseUri =
        "http://www.smh.com.au/business/the-boards-next-fear-the-female-quota-20100106-lteq.html";
    Document article = Jsoup.parse(fixture, "UTF-8", baseUri);

    // The apostrophe in the source title is a literal ’ (8217), not escaped or '.
    String title = article.title();
    assertEquals("The board’s next fear: the female quota", title);

    String language = article.select("html").attr("xml:lang");
    assertEquals("en", language);

    Elements bodyChildren = article.select(".articleBody > *");
    assertEquals(17, bodyChildren.size());
    // todo: more tests!
}
/**
 * Parses the stored news.com.au home page fixture and checks the title, one headline,
 * and that {@code abs:href} resolves a site-relative link while leaving an absolute
 * link unchanged. Disabled through {@code @Ignore}.
 *
 * @throws IOException propagated from loading or parsing the fixture file
 */
@Test
@Ignore
public void testNewsHomepage() throws IOException {
    File fixture = getFile("/htmltests/news-com-au-home.html");
    Document page = Jsoup.parse(fixture, "UTF-8", "http://www.news.com.au/");

    String title = page.title();
    assertEquals("News.com.au | News from Australia and around the world online | NewsComAu", title);

    String headline = page.select(".id1225817868581 h4").text().trim();
    assertEquals("Brace yourself for Metro meltdown", headline);

    // Site-relative link: raw value stays relative, abs:href is resolved against the base URI.
    Element horoscopes = page.select("a[href=/entertainment/horoscopes]").first();
    assertEquals("/entertainment/horoscopes", horoscopes.attr("href"));
    assertEquals("http://www.news.com.au/entertainment/horoscopes", horoscopes.attr("abs:href"));

    // Already-absolute link: raw and abs:href values must agree.
    Element external = page.select("a[href*=naughty-corners-are-a-bad-idea]").first();
    String rawHref = external.attr("href");
    assertEquals(
        "http://www.heraldsun.com.au/news/naughty-corners-are-a-bad-idea-for-kids/story-e6frf7jo-1225817899003",
        rawHref);
    assertEquals(external.attr("href"), external.attr("abs:href"));
}
/**
 * Parses the stored Google search results fixture and checks the title, the number of
 * result links matched by {@code h3.r > a}, and the targets of the first two of them.
 * Disabled through {@code @Ignore}.
 *
 * @throws IOException propagated from loading or parsing the fixture file
 */
@Test
@Ignore
public void testGoogleSearchIpod() throws IOException {
    File fixture = getFile("/htmltests/google-ipod.html");
    String baseUri = "http://www.google.com/search?hl=en&q=ipod&aq=f&oq=&aqi=g10";
    Document page = Jsoup.parse(fixture, "UTF-8", baseUri);

    assertEquals("ipod - Google Search", page.title());

    Elements resultLinks = page.select("h3.r > a");
    assertEquals(12, resultLinks.size());

    String firstHref = resultLinks.get(0).attr("href");
    assertEquals(
        "http://news.google.com/news?hl=en&q=ipod&um=1&ie=UTF-8&ei=uYlKS4SbBoGg6gPf-5XXCw&sa=X&oi=news_group&ct=title&resnum=1&ved=0CCIQsQQwAA",
        firstHref);

    String secondHref = resultLinks.get(1).attr("href");
    assertEquals("http://www.apple.com/itunes/", secondHref);
}
/**
 * Feeds a JPEG fixture to the parser and checks that parsing completes and that the
 * resulting document text contains the marker string {@code gd-jpeg}.
 *
 * @throws IOException propagated from loading or parsing the fixture file
 */
@Test
public void testBinary() throws IOException {
    File jpeg = getFile("/htmltests/thumb.jpg");
    Document parsed = Jsoup.parse(jpeg, "UTF-8");

    // The output is not meaningful; the point is that parsing binary input did not blow up.
    String text = parsed.text();
    assertTrue(text.contains("gd-jpeg"));
}
@Test
@Ignore
public void testYahooJp() throws IOException {
File in = getFile("/htmltests/yahoo-jp.html");
Document doc = Jsoup.parse(in, "UTF-8", "http://www.yahoo.co.jp/index.html"); // http charset is utf-8.
assertEquals("Yahoo! JAPAN", doc.title());
Element a = doc.select("a[href=t/2322m2]").first();
assertEquals("http://www.yahoo.co.jp/_ylh=X3oDMTB0NWxnaGxsBF9TAzIwNzcyOTYyNjUEdGlkAzEyBHRtcGwDZ2Ex/t/2322m2",
a.attr("abs:href")); // session put into