chore: code cleanup

This commit is contained in:
mikael.lantz 2021-01-15 11:41:32 +01:00
parent 6cc01f9968
commit 54ab582c20
3 changed files with 108 additions and 80 deletions

View File

@ -26,28 +26,25 @@ import se.lantz.util.FileManager;
public class C64comScraper implements Scraper public class C64comScraper implements Scraper
{ {
private static final String FRAME_NAME_TEXT = "frame[name=text]";
private static final Logger logger = LoggerFactory.getLogger(C64comScraper.class); private static final Logger logger = LoggerFactory.getLogger(C64comScraper.class);
private String c64comGameUrl = "http://www.c64.com/games/53"; private String c64comGameUrl = "";
private String screenshotCssQuery = "html > body > table > tbody > tr > td:eq(0) > table > tbody > tr:eq(1) > td > table:eq(1) > tbody > tr > td:eq(4) > table > tbody > tr:eq(0) > td > img";
private String baseForTitleAndYear = "html > body > table > tbody > tr > td:eq(0) > table > tbody > tr:eq(1) > td > table:eq(1) > tbody > tr > td:eq(4) > table > tbody > tr:eq(0) > td > table > tbody > tr:eq(1) > td"; private String baseForTitleAndYear = "html > body > table > tbody > tr > td:eq(0) > table > tbody > tr:eq(1) > td > table:eq(1) > tbody > tr > td:eq(4) > table > tbody > tr:eq(0) > td > table > tbody > tr:eq(1) > td";
private String titleCssQuery = baseForTitleAndYear + " > span"; private String titleCssQuery = baseForTitleAndYear + " > span";
private String yearCssQuery = baseForTitleAndYear + " > a:eq(2) > span"; private String yearCssQuery = baseForTitleAndYear + " > a:eq(2) > span";
private String authorCssQuery = baseForTitleAndYear + " > a:eq(3) > span"; private String authorCssQuery = baseForTitleAndYear + " > a:eq(3) > span";
private String infoTableCssQuery = "html > body > table > tbody > tr > td:eq(0) > table > tbody > tr:eq(1) > td > table:eq(1) > tbody > tr > td:eq(4) > table > tbody > tr:eq(1) > td > table > tbody > tr > td > table:eq(2) > tbody"; private String infoTableCssQuery = "html > body > table > tbody > tr > td:eq(0) > table > tbody > tr:eq(1) > td > table:eq(1) > tbody > tr > td:eq(4) > table > tbody > tr:eq(1) > td > table > tbody > tr > td > table:eq(2) > tbody";
private String screenshotCssQuery = "html > body > table > tbody > tr > td:eq(0) > table > tbody > tr:eq(1) > td > table:eq(1) > tbody > tr > td:eq(4) > table > tbody > tr:eq(0) > td > img";
private String scrapedTitle; private String scrapedTitle;
private int scrapedYear = 1985; private int scrapedYear = 1985;
private String scrapedAuthor; private String scrapedAuthor;
private List<String> scrapedMusicList = new ArrayList<>(); private List<String> scrapedMusicList = new ArrayList<>();
private String scrapedGenre; private String scrapedGenre;
private BufferedImage scrapedCover; private BufferedImage scrapedCover;
private File scrapedFile; private File scrapedFile;
Map<String, String> genreMap = new HashMap<>(); Map<String, String> genreMap = new HashMap<>();
public C64comScraper() public C64comScraper()
@ -72,7 +69,7 @@ public class C64comScraper implements Scraper
Connection.Response result = Jsoup.connect(url).method(Connection.Method.GET).execute(); Connection.Response result = Jsoup.connect(url).method(Connection.Method.GET).execute();
Document doc = result.parse(); Document doc = result.parse();
//Fetch right frame //Fetch right frame
Document mainFrameDocument = Jsoup.connect(doc.select("frame[name=text]").first().absUrl("src")).get(); Document mainFrameDocument = Jsoup.connect(doc.select(FRAME_NAME_TEXT).first().absUrl("src")).get();
//Fetch title //Fetch title
Elements queryElements = mainFrameDocument.select(titleCssQuery); Elements queryElements = mainFrameDocument.select(titleCssQuery);
Element first = queryElements.first(); Element first = queryElements.first();
@ -104,52 +101,24 @@ public class C64comScraper implements Scraper
Connection.Response result = Jsoup.connect(c64comGameUrl).method(Connection.Method.GET).execute(); Connection.Response result = Jsoup.connect(c64comGameUrl).method(Connection.Method.GET).execute();
doc = result.parse(); doc = result.parse();
//Fetch right frame //Fetch right frame
Document mainFrameDocument = Jsoup.connect(doc.select("frame[name=text]").first().absUrl("src")).get(); Document mainFrameDocument = Jsoup.connect(doc.select(FRAME_NAME_TEXT).first().absUrl("src")).get();
if (fields.isTitle()) if (fields.isTitle())
{ {
//Fetch title scrapeTitle(mainFrameDocument);
Elements queryElements = mainFrameDocument.select(titleCssQuery);
logger.debug("queryElements = " + queryElements);
Element first = queryElements.first();
if (first != null)
{
scrapedTitle = first.text();
}
logger.debug("scraped title: {}", scrapedTitle);
} }
if (fields.isYear()) if (fields.isYear())
{ {
//Fetch year scrapeYear(mainFrameDocument);
Elements yearElements = mainFrameDocument.select(yearCssQuery);
if (yearElements.first() != null)
{
try
{
scrapedYear = Integer.parseInt(yearElements.first().text().trim());
}
catch (Exception e)
{
logger.error("Could not scrape year for {}", scrapedTitle);
}
}
logger.debug("scraped year: {}", scrapedYear);
} }
if (fields.isAuthor()) if (fields.isAuthor())
{ {
//Fetch author scrapeAuthor(mainFrameDocument);
Elements authorElements = mainFrameDocument.select(authorCssQuery);
if (authorElements.first() != null)
{
scrapedAuthor = authorElements.first().text();
}
logger.debug("scraped author: {}", scrapedAuthor);
} }
//Fetch infotable and find music and genre //Fetch infotable and find music, genre, cover and game
Elements infoElements = mainFrameDocument.select(infoTableCssQuery); Elements infoElements = mainFrameDocument.select(infoTableCssQuery);
if (infoElements.first() != null) if (infoElements.first() != null)
{ {
@ -174,42 +143,12 @@ public class C64comScraper implements Scraper
} }
if (fields.isCover() && info.startsWith("Inlay")) if (fields.isCover() && info.startsWith("Inlay"))
{ {
String url = child.select("td:eq(1) > a").first().attr("href"); scrapeCover(child);
//Select the right part
url = url.substring(url.indexOf("'")+1);
url = url.substring(0, url.indexOf("'"));
url = url.substring(url.indexOf("=")+1);
url = "http://www.c64.com/games/" + url;
URL imageUrl = new URL(url);
scrapedCover = ImageIO.read(imageUrl);
logger.debug("Cover url: {}", url);
continue; continue;
} }
if (fields.isGame() && info.equalsIgnoreCase("Download:")) if (fields.isGame() && info.equalsIgnoreCase("Download:"))
{ {
Element gameElement = child.select("td:eq(1) > a").first(); scrapeGame(child);
if (gameElement.text().equalsIgnoreCase("Game"))
{
try
{
String url = gameElement.attr("abs:href");
Response response = Jsoup.connect(url)
.header("Accept-Encoding", "gzip, deflate")
.userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0")
.ignoreContentType(true)
.maxBodySize(0)
.timeout(600000)
.execute();
//create a temp file and fetch the content
scrapedFile = FileManager.createTempFileForScraper(response.bodyStream());
logger.debug("File to include as game: {}", scrapedFile != null ? scrapedFile.getAbsolutePath() : null);
}
catch (Exception e)
{
logger.error("Could not scrape game file for " + scrapedTitle , e);
}
}
} }
} }
} }
@ -220,6 +159,97 @@ public class C64comScraper implements Scraper
} }
} }
/**
 * Looks up the title element in the given frame document and, when one is
 * found, stores its text in {@code scrapedTitle}. When nothing matches, the
 * previously held title is left untouched.
 *
 * @param mainFrameDocument the document to run {@code titleCssQuery} against
 */
private void scrapeTitle(Document mainFrameDocument)
{
  final Elements titleMatches = mainFrameDocument.select(titleCssQuery);
  logger.debug("queryElements = {}", titleMatches);

  // first() yields null when the selector matched nothing
  final Element titleElement = titleMatches.first();
  if (titleElement != null)
  {
    scrapedTitle = titleElement.text();
  }

  logger.debug("scraped title: {}", scrapedTitle);
}
/**
 * Looks up the year element in the given frame document and, when its text
 * parses as an integer, stores it in {@code scrapedYear}. When no element
 * matches or the text is not a valid integer, the previous value is kept.
 *
 * @param mainFrameDocument the document to run {@code yearCssQuery} against
 */
private void scrapeYear(Document mainFrameDocument)
{
  //Fetch year
  Element yearElement = mainFrameDocument.select(yearCssQuery).first();
  if (yearElement != null)
  {
    String yearText = yearElement.text().trim();
    try
    {
      scrapedYear = Integer.parseInt(yearText);
    }
    catch (NumberFormatException e)
    {
      // Only the parse can fail here; keep the offending text and the cause in the log
      logger.error("Could not scrape year for {} (found '{}')", scrapedTitle, yearText, e);
    }
  }
  logger.debug("scraped year: {}", scrapedYear);
}
/**
 * Looks up the author element in the given frame document and, when one is
 * found, stores its text in {@code scrapedAuthor}. When nothing matches, the
 * previously held author is left untouched.
 *
 * @param mainFrameDocument the document to run {@code authorCssQuery} against
 */
private void scrapeAuthor(Document mainFrameDocument)
{
  // first() yields null when the selector matched nothing
  final Element authorElement = mainFrameDocument.select(authorCssQuery).first();
  if (authorElement != null)
  {
    scrapedAuthor = authorElement.text();
  }

  logger.debug("scraped author: {}", scrapedAuthor);
}
/**
 * Reads the cover image referenced by the link in the second cell of the
 * given info-table row and stores it in {@code scrapedCover}.
 * <p>
 * The link's {@code href} is expected to contain a single-quoted argument;
 * the part of that argument after the first {@code =} is appended to
 * {@code http://www.c64.com/games/} to form the image URL. When the link is
 * missing, the href does not have that shape, or the image cannot be read,
 * an error is logged and the method returns without throwing.
 *
 * @param element the info-table row holding the cover link
 */
private void scrapeCover(Element element)
{
  Element link = element.select("td:eq(1) > a").first();
  if (link == null)
  {
    logger.error("Could not scrape cover for {}: no cover link found", scrapedTitle);
    return;
  }

  //Select the right part: the text between the first pair of single quotes
  String href = link.attr("href");
  int openQuote = href.indexOf('\'');
  int closeQuote = href.indexOf('\'', openQuote + 1);
  if (openQuote < 0 || closeQuote < 0)
  {
    logger.error("Could not scrape cover for {}: unexpected link format '{}'", scrapedTitle, href);
    return;
  }
  String quoted = href.substring(openQuote + 1, closeQuote);
  // indexOf returns -1 when there is no '=', in which case the whole quoted text is used
  String url = "http://www.c64.com/games/" + quoted.substring(quoted.indexOf('=') + 1);

  try
  {
    scrapedCover = ImageIO.read(new URL(url));
  }
  catch (IOException e)
  {
    logger.error("Could not scrape cover for {}", scrapedTitle, e);
  }
  logger.debug("Cover url: {}", url);
}
/**
 * Downloads the game file referenced by the link in the second cell of the
 * given info-table row and stores the resulting temp file in
 * {@code scrapedFile}.
 * <p>
 * Nothing happens when the row has no link or the link text is not "Game"
 * (case-insensitive). Any failure during the download is logged and does not
 * propagate to the caller.
 *
 * @param element the info-table row holding the download link
 */
private void scrapeGame(Element element)
{
  Element gameElement = element.select("td:eq(1) > a").first();
  // first() yields null when the row has no link; treat that like a non-game row
  if (gameElement == null || !gameElement.text().equalsIgnoreCase("Game"))
  {
    return;
  }

  try
  {
    String url = gameElement.attr("abs:href");
    Response response = Jsoup.connect(url)
      .header("Accept-Encoding", "gzip, deflate")
      .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0")
      .ignoreContentType(true)
      .maxBodySize(0)
      .timeout(600000)
      .execute();
    //create a temp file and fetch the content
    // NOTE(review): presumably createTempFileForScraper closes the body stream - confirm
    scrapedFile = FileManager.createTempFileForScraper(response.bodyStream());
    logger.debug("File to include as game: {}", scrapedFile != null ? scrapedFile.getAbsolutePath() : null);
  }
  catch (Exception e)
  {
    logger.error("Could not scrape game file for {}", scrapedTitle, e);
  }
}
private String mapGenre(String genreFromC64com) private String mapGenre(String genreFromC64com)
{ {
//Map towards available genres, return first one found //Map towards available genres, return first one found
@ -227,7 +257,6 @@ public class C64comScraper implements Scraper
{ {
if (entry.getKey().contains(genreFromC64com)) if (entry.getKey().contains(genreFromC64com))
{ {
logger.debug(entry.getKey() + "/" + entry.getValue());
return entry.getValue(); return entry.getValue();
} }
} }
@ -244,7 +273,7 @@ public class C64comScraper implements Scraper
Connection.Response result = Jsoup.connect(c64comGameUrl).method(Connection.Method.GET).execute(); Connection.Response result = Jsoup.connect(c64comGameUrl).method(Connection.Method.GET).execute();
doc = result.parse(); doc = result.parse();
//Fetch right frame //Fetch right frame
Document mainFrameDocument = Jsoup.connect(doc.select("frame[name=text]").first().absUrl("src")).get(); Document mainFrameDocument = Jsoup.connect(doc.select(FRAME_NAME_TEXT).first().absUrl("src")).get();
//Fetch the right element //Fetch the right element
Elements coverElements = mainFrameDocument.select(screenshotCssQuery); Elements coverElements = mainFrameDocument.select(screenshotCssQuery);
if (coverElements.first() != null) if (coverElements.first() != null)

View File

@ -268,7 +268,6 @@ public class MobyGamesScraper implements Scraper
if (node instanceof TextNode) if (node instanceof TextNode)
{ {
String test = ((TextNode)node).text(); String test = ((TextNode)node).text();
//TODO: Add more possible labels
if (test.contains("Music") || test.contains("music")) if (test.contains("Music") || test.contains("music"))
{ {
musicFound = true; musicFound = true;

View File

@ -6,10 +6,11 @@ import java.io.IOException;
import java.util.List; import java.util.List;
import se.lantz.model.data.ScraperFields; import se.lantz.model.data.ScraperFields;
/**
* Common interface implemented by all scrapers.
*/
public interface Scraper public interface Scraper
{ {
void connect(String url) throws IOException; void connect(String url) throws IOException;
void scrapeInformation(ScraperFields fields); void scrapeInformation(ScraperFields fields);
@ -33,5 +34,4 @@ public interface Scraper
boolean isC64(); boolean isC64();
File getGameFile(); File getGameFile();
} }