Summary
I have a Python based web scraping pet project that I'm trying to implement some TDD in, but I quickly run into a problem. The unit tests require an internet connection, as well as downloading of html text. While I understand that the actual parsing can be done with a local file, some methods are used to simply redefine the URL and query the website again. This seems to break some of the best practices for TDD (citation: Clean Code by Robert Martin claims that tests should be runnable in any environment). While this is a Python project, I ran into a similar issue using R for Yahoo Finance scraping, and I'm sure this kind of thing is language agnostic. At the very least, this problem seems to violate a major guideline in TDD, which is that the tests should run fast.
tldr; Are there any best practices for handling network connections in TDD?
Reproducible Example
AbstractScraper.py
from urllib.request import urlopen
from bs4 import BeautifulSoup
class AbstractScraper:
def __init__(self, url):
self.url = url
self.dataDictionary = None
def makeDataDictionary(self):
html = urlopen(self.url)
text = html.read().decode("utf-8")
soup = BeautifulSoup(text, "lxml")
self.dataDictionary = {"html": html, "text": text, "soup": soup}
def writeSoup(self, path):
with open(path, "w") as outfile:
outfile.write(self.dataDictionary["soup"].prettify())
TestAbstractScraper.py
import unittest
from http.client import HTTPResponse
from bs4 import BeautifulSoup
from CrackedScrapeProject.scrape.AbstractScraper import AbstractScraper
from io import StringIO
class TestAbstractScraperMethods(unittest.TestCase):
def setUp(self):
self.scraper = AbstractScraper("http://ift.tt/1f4Af77")
self.scraper.makeDataDictionary()
def test_dataDictionaryContents(self):
self.assertTrue(isinstance(self.scraper.dataDictionary, dict))
self.assertTrue(isinstance(self.scraper.dataDictionary["html"], HTTPResponse))
self.assertTrue(isinstance(self.scraper.dataDictionary["text"], str))
self.assertTrue(isinstance(self.scraper.dataDictionary["soup"], BeautifulSoup))
self.assertSetEqual(set(self.scraper.dataDictionary.keys()), set(["text", "soup", "html"]))
def test_writeSoup(self):
filePath = "C:/users/athompson/desktop/testFile.html"
self.scraper.writeSoup(filePath)
self.writtenData = open(filePath, "r").read()
self.assertEqual(self.writtenData, self.scraper.dataDictionary["soup"].prettify())
if __name__ == '__main__':
suite = unittest.TestLoader().loadTestsFromTestCase(TestAbstractScraperMethods)
unittest.TextTestRunner(verbosity=2).run(suite)
Aucun commentaire:
Enregistrer un commentaire