diff --git a/Corbia spider.md b/Corbia spider.md index bb33c9a..64d7779 100644 --- a/Corbia spider.md +++ b/Corbia spider.md @@ -47,12 +47,14 @@ def scrape_domain(self): For each page that will be processed by the domain based function, the `BeautifulSoup` object is accessible via `self.soup` +Consider this example html: + ```html - Venmo truffaut shabby chic organic + Venmo
@@ -93,14 +95,28 @@ For each page that will be processed by the domain based function, the `Beautifu ``` +(If you're wondering what this dummy text is, it comes from [hipsum](https://hipsum.co/), a [Lorem ipsum](https://en.wikipedia.org/wiki/Lorem_ipsum)-like generator.) + +From this `html`, we would like to retrieve this `json`: +```json +{ + "title" : "Venmo truffaut shabby chic organic", + "text" : "I'm baby wayfarers tote bag gochujang cred food truck VHS quinoa kogi Brooklyn yr vegan etsy. Portland squid DSA, raclette flannel pinterest craft beer cloud bread pour-over same. Air plant pickled man braid tilde drinking vinegar ascot DIY poke meditation iceland JOMO sustainable. Hell of tbh kombucha +1 listicle.", + "success" : true +} +``` ```python def scrape_domain(self): try: - title = self.soup.find('title') + content = self.soup.find("div", class_="article") + title = content.find("h2") + article_text = content.find("section", class_="text") + data = { - "title" : title.text.strip + "title" : title.text.strip(), + "text": article_text.text.strip(), "success" : True } return json.dumps(data, indent=4, ensure_ascii=False)