chore: code cleanup
This commit is contained in:
parent
6cc01f9968
commit
54ab582c20
|
@ -26,28 +26,25 @@ import se.lantz.util.FileManager;
|
||||||
|
|
||||||
public class C64comScraper implements Scraper
|
public class C64comScraper implements Scraper
|
||||||
{
|
{
|
||||||
|
private static final String FRAME_NAME_TEXT = "frame[name=text]";
|
||||||
private static final Logger logger = LoggerFactory.getLogger(C64comScraper.class);
|
private static final Logger logger = LoggerFactory.getLogger(C64comScraper.class);
|
||||||
private String c64comGameUrl = "http://www.c64.com/games/53";
|
private String c64comGameUrl = "";
|
||||||
|
|
||||||
private String screenshotCssQuery = "html > body > table > tbody > tr > td:eq(0) > table > tbody > tr:eq(1) > td > table:eq(1) > tbody > tr > td:eq(4) > table > tbody > tr:eq(0) > td > img";
|
|
||||||
|
|
||||||
private String baseForTitleAndYear = "html > body > table > tbody > tr > td:eq(0) > table > tbody > tr:eq(1) > td > table:eq(1) > tbody > tr > td:eq(4) > table > tbody > tr:eq(0) > td > table > tbody > tr:eq(1) > td";
|
private String baseForTitleAndYear = "html > body > table > tbody > tr > td:eq(0) > table > tbody > tr:eq(1) > td > table:eq(1) > tbody > tr > td:eq(4) > table > tbody > tr:eq(0) > td > table > tbody > tr:eq(1) > td";
|
||||||
private String titleCssQuery = baseForTitleAndYear + " > span";
|
private String titleCssQuery = baseForTitleAndYear + " > span";
|
||||||
private String yearCssQuery = baseForTitleAndYear + " > a:eq(2) > span";
|
private String yearCssQuery = baseForTitleAndYear + " > a:eq(2) > span";
|
||||||
private String authorCssQuery = baseForTitleAndYear + " > a:eq(3) > span";
|
private String authorCssQuery = baseForTitleAndYear + " > a:eq(3) > span";
|
||||||
|
|
||||||
private String infoTableCssQuery = "html > body > table > tbody > tr > td:eq(0) > table > tbody > tr:eq(1) > td > table:eq(1) > tbody > tr > td:eq(4) > table > tbody > tr:eq(1) > td > table > tbody > tr > td > table:eq(2) > tbody";
|
private String infoTableCssQuery = "html > body > table > tbody > tr > td:eq(0) > table > tbody > tr:eq(1) > td > table:eq(1) > tbody > tr > td:eq(4) > table > tbody > tr:eq(1) > td > table > tbody > tr > td > table:eq(2) > tbody";
|
||||||
|
private String screenshotCssQuery = "html > body > table > tbody > tr > td:eq(0) > table > tbody > tr:eq(1) > td > table:eq(1) > tbody > tr > td:eq(4) > table > tbody > tr:eq(0) > td > img";
|
||||||
|
|
||||||
private String scrapedTitle;
|
private String scrapedTitle;
|
||||||
private int scrapedYear = 1985;
|
private int scrapedYear = 1985;
|
||||||
private String scrapedAuthor;
|
private String scrapedAuthor;
|
||||||
private List<String> scrapedMusicList = new ArrayList<>();
|
private List<String> scrapedMusicList = new ArrayList<>();
|
||||||
|
|
||||||
private String scrapedGenre;
|
private String scrapedGenre;
|
||||||
private BufferedImage scrapedCover;
|
private BufferedImage scrapedCover;
|
||||||
private File scrapedFile;
|
private File scrapedFile;
|
||||||
|
|
||||||
Map<String, String> genreMap = new HashMap<>();
|
Map<String, String> genreMap = new HashMap<>();
|
||||||
|
|
||||||
public C64comScraper()
|
public C64comScraper()
|
||||||
|
@ -72,7 +69,7 @@ public class C64comScraper implements Scraper
|
||||||
Connection.Response result = Jsoup.connect(url).method(Connection.Method.GET).execute();
|
Connection.Response result = Jsoup.connect(url).method(Connection.Method.GET).execute();
|
||||||
Document doc = result.parse();
|
Document doc = result.parse();
|
||||||
//Fetch right frame
|
//Fetch right frame
|
||||||
Document mainFrameDocument = Jsoup.connect(doc.select("frame[name=text]").first().absUrl("src")).get();
|
Document mainFrameDocument = Jsoup.connect(doc.select(FRAME_NAME_TEXT).first().absUrl("src")).get();
|
||||||
//Fetch title
|
//Fetch title
|
||||||
Elements queryElements = mainFrameDocument.select(titleCssQuery);
|
Elements queryElements = mainFrameDocument.select(titleCssQuery);
|
||||||
Element first = queryElements.first();
|
Element first = queryElements.first();
|
||||||
|
@ -104,52 +101,24 @@ public class C64comScraper implements Scraper
|
||||||
Connection.Response result = Jsoup.connect(c64comGameUrl).method(Connection.Method.GET).execute();
|
Connection.Response result = Jsoup.connect(c64comGameUrl).method(Connection.Method.GET).execute();
|
||||||
doc = result.parse();
|
doc = result.parse();
|
||||||
//Fetch right frame
|
//Fetch right frame
|
||||||
Document mainFrameDocument = Jsoup.connect(doc.select("frame[name=text]").first().absUrl("src")).get();
|
Document mainFrameDocument = Jsoup.connect(doc.select(FRAME_NAME_TEXT).first().absUrl("src")).get();
|
||||||
|
|
||||||
if (fields.isTitle())
|
if (fields.isTitle())
|
||||||
{
|
{
|
||||||
//Fetch title
|
scrapeTitle(mainFrameDocument);
|
||||||
Elements queryElements = mainFrameDocument.select(titleCssQuery);
|
|
||||||
logger.debug("queryElements = " + queryElements);
|
|
||||||
Element first = queryElements.first();
|
|
||||||
if (first != null)
|
|
||||||
{
|
|
||||||
scrapedTitle = first.text();
|
|
||||||
}
|
|
||||||
logger.debug("scraped title: {}", scrapedTitle);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fields.isYear())
|
if (fields.isYear())
|
||||||
{
|
{
|
||||||
//Fetch year
|
scrapeYear(mainFrameDocument);
|
||||||
Elements yearElements = mainFrameDocument.select(yearCssQuery);
|
|
||||||
if (yearElements.first() != null)
|
|
||||||
{
|
|
||||||
try
|
|
||||||
{
|
|
||||||
scrapedYear = Integer.parseInt(yearElements.first().text().trim());
|
|
||||||
}
|
|
||||||
catch (Exception e)
|
|
||||||
{
|
|
||||||
logger.error("Could not scrape year for {}", scrapedTitle);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
logger.debug("scraped year: {}", scrapedYear);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fields.isAuthor())
|
if (fields.isAuthor())
|
||||||
{
|
{
|
||||||
//Fetch author
|
scrapeAuthor(mainFrameDocument);
|
||||||
Elements authorElements = mainFrameDocument.select(authorCssQuery);
|
|
||||||
if (authorElements.first() != null)
|
|
||||||
{
|
|
||||||
scrapedAuthor = authorElements.first().text();
|
|
||||||
}
|
|
||||||
logger.debug("scraped author: {}", scrapedAuthor);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//Fetch infotable and find music and genre
|
//Fetch infotable and find music, genre, cover and game
|
||||||
Elements infoElements = mainFrameDocument.select(infoTableCssQuery);
|
Elements infoElements = mainFrameDocument.select(infoTableCssQuery);
|
||||||
if (infoElements.first() != null)
|
if (infoElements.first() != null)
|
||||||
{
|
{
|
||||||
|
@ -174,42 +143,12 @@ public class C64comScraper implements Scraper
|
||||||
}
|
}
|
||||||
if (fields.isCover() && info.startsWith("Inlay"))
|
if (fields.isCover() && info.startsWith("Inlay"))
|
||||||
{
|
{
|
||||||
String url = child.select("td:eq(1) > a").first().attr("href");
|
scrapeCover(child);
|
||||||
//Select the right part
|
|
||||||
url = url.substring(url.indexOf("'")+1);
|
|
||||||
url = url.substring(0, url.indexOf("'"));
|
|
||||||
url = url.substring(url.indexOf("=")+1);
|
|
||||||
url = "http://www.c64.com/games/" + url;
|
|
||||||
URL imageUrl = new URL(url);
|
|
||||||
scrapedCover = ImageIO.read(imageUrl);
|
|
||||||
logger.debug("Cover url: {}", url);
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (fields.isGame() && info.equalsIgnoreCase("Download:"))
|
if (fields.isGame() && info.equalsIgnoreCase("Download:"))
|
||||||
{
|
{
|
||||||
Element gameElement = child.select("td:eq(1) > a").first();
|
scrapeGame(child);
|
||||||
if (gameElement.text().equalsIgnoreCase("Game"))
|
|
||||||
{
|
|
||||||
try
|
|
||||||
{
|
|
||||||
String url = gameElement.attr("abs:href");
|
|
||||||
Response response = Jsoup.connect(url)
|
|
||||||
.header("Accept-Encoding", "gzip, deflate")
|
|
||||||
.userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0")
|
|
||||||
.ignoreContentType(true)
|
|
||||||
.maxBodySize(0)
|
|
||||||
.timeout(600000)
|
|
||||||
.execute();
|
|
||||||
|
|
||||||
//create a temp file and fetch the content
|
|
||||||
scrapedFile = FileManager.createTempFileForScraper(response.bodyStream());
|
|
||||||
logger.debug("File to include as game: {}", scrapedFile != null ? scrapedFile.getAbsolutePath() : null);
|
|
||||||
}
|
|
||||||
catch (Exception e)
|
|
||||||
{
|
|
||||||
logger.error("Could not scrape game file for " + scrapedTitle , e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -220,6 +159,97 @@ public class C64comScraper implements Scraper
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void scrapeTitle(Document mainFrameDocument)
|
||||||
|
{
|
||||||
|
//Fetch title
|
||||||
|
Elements queryElements = mainFrameDocument.select(titleCssQuery);
|
||||||
|
logger.debug("queryElements = {}", queryElements);
|
||||||
|
Element first = queryElements.first();
|
||||||
|
if (first != null)
|
||||||
|
{
|
||||||
|
scrapedTitle = first.text();
|
||||||
|
}
|
||||||
|
logger.debug("scraped title: {}", scrapedTitle);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void scrapeYear(Document mainFrameDocument)
|
||||||
|
{
|
||||||
|
//Fetch year
|
||||||
|
Elements yearElements = mainFrameDocument.select(yearCssQuery);
|
||||||
|
if (yearElements.first() != null)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
scrapedYear = Integer.parseInt(yearElements.first().text().trim());
|
||||||
|
}
|
||||||
|
catch (Exception e)
|
||||||
|
{
|
||||||
|
logger.error("Could not scrape year for {}", scrapedTitle);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
logger.debug("scraped year: {}", scrapedYear);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void scrapeAuthor(Document mainFrameDocument)
|
||||||
|
{
|
||||||
|
//Fetch author
|
||||||
|
Elements authorElements = mainFrameDocument.select(authorCssQuery);
|
||||||
|
if (authorElements.first() != null)
|
||||||
|
{
|
||||||
|
scrapedAuthor = authorElements.first().text();
|
||||||
|
}
|
||||||
|
logger.debug("scraped author: {}", scrapedAuthor);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void scrapeCover(Element element)
|
||||||
|
{
|
||||||
|
String url = element.select("td:eq(1) > a").first().attr("href");
|
||||||
|
//Select the right part
|
||||||
|
url = url.substring(url.indexOf("'")+1);
|
||||||
|
url = url.substring(0, url.indexOf("'"));
|
||||||
|
url = url.substring(url.indexOf("=")+1);
|
||||||
|
url = "http://www.c64.com/games/" + url;
|
||||||
|
URL imageUrl;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
imageUrl = new URL(url);
|
||||||
|
scrapedCover = ImageIO.read(imageUrl);
|
||||||
|
}
|
||||||
|
catch (IOException e)
|
||||||
|
{
|
||||||
|
logger.error("Could not scrape cover for " + scrapedTitle , e);
|
||||||
|
}
|
||||||
|
logger.debug("Cover url: {}", url);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void scrapeGame(Element element)
|
||||||
|
{
|
||||||
|
Element gameElement = element.select("td:eq(1) > a").first();
|
||||||
|
if (gameElement.text().equalsIgnoreCase("Game"))
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
String url = gameElement.attr("abs:href");
|
||||||
|
Response response = Jsoup.connect(url)
|
||||||
|
.header("Accept-Encoding", "gzip, deflate")
|
||||||
|
.userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0")
|
||||||
|
.ignoreContentType(true)
|
||||||
|
.maxBodySize(0)
|
||||||
|
.timeout(600000)
|
||||||
|
.execute();
|
||||||
|
|
||||||
|
//create a temp file and fetch the content
|
||||||
|
scrapedFile = FileManager.createTempFileForScraper(response.bodyStream());
|
||||||
|
logger.debug("File to include as game: {}", scrapedFile != null ? scrapedFile.getAbsolutePath() : null);
|
||||||
|
}
|
||||||
|
catch (Exception e)
|
||||||
|
{
|
||||||
|
logger.error("Could not scrape game file for " + scrapedTitle , e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private String mapGenre(String genreFromC64com)
|
private String mapGenre(String genreFromC64com)
|
||||||
{
|
{
|
||||||
//Map towards available genres, return first one found
|
//Map towards available genres, return first one found
|
||||||
|
@ -227,7 +257,6 @@ public class C64comScraper implements Scraper
|
||||||
{
|
{
|
||||||
if (entry.getKey().contains(genreFromC64com))
|
if (entry.getKey().contains(genreFromC64com))
|
||||||
{
|
{
|
||||||
logger.debug(entry.getKey() + "/" + entry.getValue());
|
|
||||||
return entry.getValue();
|
return entry.getValue();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -244,7 +273,7 @@ public class C64comScraper implements Scraper
|
||||||
Connection.Response result = Jsoup.connect(c64comGameUrl).method(Connection.Method.GET).execute();
|
Connection.Response result = Jsoup.connect(c64comGameUrl).method(Connection.Method.GET).execute();
|
||||||
doc = result.parse();
|
doc = result.parse();
|
||||||
//Fetch right frame
|
//Fetch right frame
|
||||||
Document mainFrameDocument = Jsoup.connect(doc.select("frame[name=text]").first().absUrl("src")).get();
|
Document mainFrameDocument = Jsoup.connect(doc.select(FRAME_NAME_TEXT).first().absUrl("src")).get();
|
||||||
//Fetch the right element
|
//Fetch the right element
|
||||||
Elements coverElements = mainFrameDocument.select(screenshotCssQuery);
|
Elements coverElements = mainFrameDocument.select(screenshotCssQuery);
|
||||||
if (coverElements.first() != null)
|
if (coverElements.first() != null)
|
||||||
|
|
|
@ -268,7 +268,6 @@ public class MobyGamesScraper implements Scraper
|
||||||
if (node instanceof TextNode)
|
if (node instanceof TextNode)
|
||||||
{
|
{
|
||||||
String test = ((TextNode)node).text();
|
String test = ((TextNode)node).text();
|
||||||
//TODO: Add more possible labels
|
|
||||||
if (test.contains("Music") || test.contains("music"))
|
if (test.contains("Music") || test.contains("music"))
|
||||||
{
|
{
|
||||||
musicFound = true;
|
musicFound = true;
|
||||||
|
|
|
@ -6,10 +6,11 @@ import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import se.lantz.model.data.ScraperFields;
|
import se.lantz.model.data.ScraperFields;
|
||||||
|
/**
|
||||||
|
* Common interface implemented by all scrapers.
|
||||||
|
*/
|
||||||
public interface Scraper
|
public interface Scraper
|
||||||
{
|
{
|
||||||
|
|
||||||
void connect(String url) throws IOException;
|
void connect(String url) throws IOException;
|
||||||
|
|
||||||
void scrapeInformation(ScraperFields fields);
|
void scrapeInformation(ScraperFields fields);
|
||||||
|
@ -33,5 +34,4 @@ public interface Scraper
|
||||||
boolean isC64();
|
boolean isC64();
|
||||||
|
|
||||||
File getGameFile();
|
File getGameFile();
|
||||||
|
|
||||||
}
|
}
|
Loading…
Reference in New Issue