I am having a difficult time testing the class below. As it stands, I would have to mock the Jsoup calls behind each of the many methods that extract a single article component (headline, lead, etc.).
I managed to solve it by extracting all the functionality related to scrape(Selector) into a separate class. The problem is that it doesn't belong in a separate class. Could this be the "feature envy" code smell?
The class before the functionality is extracted:
/**
 * Base scraper that assembles an {@link Article} from a parsed article document.
 *
 * <p>Plain-text components (headline, subtitle, lead, body, author) are read from the
 * document through the selectors supplied by the {@link ArticleComponentSelectorContainer}.
 * Subclasses supply the remaining components (sources, categories, quotations, images,
 * subjects and sub-articles) by implementing the abstract methods.
 *
 * <p>NOTE(review): {@link #scrape(Document, Category, int)} stores the document in an
 * instance field that the extraction helpers read, so one instance should not be used for
 * overlapping {@code scrape} calls.
 */
public abstract class NewsPaperArticleScraper implements ArticleScraper {

    /** Medium assigned to every article built by this scraper. */
    private static final Medium MEDIUM = NEWS_PAPER_WEBSITE;

    /** Separator placed between the texts of consecutive matched elements (one blank line). */
    private static final String BLANK_LINE = System.lineSeparator() + System.lineSeparator();

    private final ArticleComponentSelectorContainer selector;
    private final MD5HashCalculator hashCalculator;
    private final Company company;

    /** Document of the article currently being scraped; set at the start of each scrape call. */
    private Document articleDocument;

    /**
     * @param selector       provides the selector for each text component of the article
     * @param hashCalculator computes the article hash from the headline and body text
     * @param company        company set on every article built by this scraper
     */
    public NewsPaperArticleScraper(ArticleComponentSelectorContainer selector, MD5HashCalculator hashCalculator, Company company) {
        this.selector = selector;
        this.hashCalculator = hashCalculator;
        this.company = company;
    }

    /**
     * Builds an article from the given document.
     *
     * @param document          the parsed article page; may be {@code null}
     * @param category          passed on to {@link #getCategories(Category)}
     * @param orderOfAppearance stored on the article unchanged
     * @return the assembled article, or {@code null} when {@code document} is {@code null}
     */
    public Article scrape(Document document, Category category, int orderOfAppearance) {
        if (document == null) {
            return null;
        }
        this.articleDocument = document;

        // Headline and body are extracted once and reused for both the hash and the article.
        String headline = getHeadline();
        String body = getBody();
        String hash = hashCalculator.hash(headline + body);

        return article()
                .withUrl(getUrl())
                .withHash(hash)
                .withHeadline(headline)
                .withOrderOfAppearance(orderOfAppearance)
                .withSubtTitle(getSubTitle())
                .withLead(getLead())
                .withBody(body)
                .withQuotations(getQuotations())
                .withAuthor(getAuthor())
                .withCompany(company)
                .withCategories(getCategories(category))
                .withSubjects(getSubjects())
                .withImages(getImages())
                .withSources(getSources())
                .withFetchDate(LocalDate.now())
                .withPublishedDate(getPublishedDate())
                .withSubArticles(getSubArticles())
                .withMedium(MEDIUM)
                .build();
    }

    /**
     * Extracts the value described by the selector from the current document: element text
     * when the selector has a tag only, otherwise the value of the selector's attribute.
     */
    private String scrape(Selector selector) {
        if (selector.hasTagOnly()) {
            return scrapeByTag(selector.getTag());
        }
        return scrapeByTagAndAttribute(selector.getTag(), selector.getAttr());
    }

    /**
     * Returns the trimmed text of every element matching {@code tag}, with consecutive
     * elements separated by a blank line. No match yields the empty string; a single match
     * yields just that element's trimmed text.
     */
    private String scrapeByTag(String tag) {
        Elements elements = articleDocument.select(tag);
        StringBuilder text = new StringBuilder();
        // Index-based so the separator is added between elements even when an element's text is empty.
        for (int i = 0; i < elements.size(); i++) {
            if (i > 0) {
                text.append(BLANK_LINE);
            }
            text.append(elements.get(i).text().trim());
        }
        return text.toString();
    }

    /** Returns the value of {@code attr} as reported by the elements matching {@code tag}. */
    private String scrapeByTagAndAttribute(String tag, String attr) {
        return articleDocument.select(tag).attr(attr);
    }

    // Value setting methods for the article that's being assembled in scrape(document, ...).

    private String getUrl() {
        return articleDocument.baseUri();
    }

    private String getHeadline() {
        return scrape(selector.forHeadline());
    }

    private String getSubTitle() {
        return scrape(selector.forSubtitle());
    }

    private String getLead() {
        return scrape(selector.forLead());
    }

    private String getBody() {
        return scrape(selector.forBody());
    }

    /** Returns the author text extracted with the author selector. */
    protected String getAuthor() {
        return scrape(selector.forAuthor());
    }

    // NOTE(review): not implemented yet. This always throws, and scrape(document, ...) calls it
    // for every non-null document, so scraping cannot complete until this is implemented.
    private LocalDate getPublishedDate() {
        throw new NotImplementedException();
    }

    /** Returns the sources to set on the article being assembled. */
    protected abstract Set<Source> getSources();

    /**
     * Returns the categories to set on the article being assembled.
     *
     * @param category the category that was passed to {@code scrape}
     */
    protected abstract Set<Category> getCategories(Category category);

    /** Returns the quotations to set on the article being assembled. */
    protected abstract List<Quotation> getQuotations();

    /** Returns the images to set on the article being assembled. */
    protected abstract List<Image> getImages();

    /** Returns the subjects to set on the article being assembled. */
    protected abstract List<Subject> getSubjects();

    /** Returns the sub-articles to set on the article being assembled. */
    protected abstract List<SubArticle> getSubArticles();
}
Aucun commentaire:
Enregistrer un commentaire