vault backup: 2024-01-02 13:42:24
Affected files: Corbia spider.md
This commit is contained in:
parent
0fd245e063
commit
f91903dba3
|
@ -47,12 +47,14 @@ def scrape_domain(self):
|
||||||
|
|
||||||
For each page that will be processed by the domain based function, the `BeautifulSoup` object is accessible via `self.soup`
|
For each page that will be processed by the domain based function, the `BeautifulSoup` object is accessible via `self.soup`
|
||||||
|
|
||||||
|
Consider this example html:
|
||||||
|
|
||||||
```html
|
```html
|
||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
<html>
|
<html>
|
||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8">
|
<meta charset="UTF-8">
|
||||||
<title>Venmo truffaut shabby chic organic</title>
|
<title>Venmo</title>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div id="root">
|
<div id="root">
|
||||||
|
@ -93,14 +95,28 @@ For each page that will be processed by the domain based function, the `Beautifu
|
||||||
</html>
|
</html>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
(If you're wondering what this dummy text is, it comes from [hipsum](https://hipsum.co/), a [Lorem ipsum](https://en.wikipedia.org/wiki/Lorem_ipsum)-like generator.)
|
||||||
|
|
||||||
|
From this `html`, we would like to retrieve this `json`:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"title" : "Venmo truffaut shabby chic organic",
|
||||||
|
"text" : "I'm baby wayfarers tote bag gochujang cred food truck VHS quinoa kogi Brooklyn yr vegan etsy. Portland squid DSA, raclette flannel pinterest craft beer cloud bread pour-over same. Air plant pickled man braid tilde drinking vinegar ascot DIY poke meditation iceland JOMO sustainable. Hell of tbh kombucha +1 listicle.",
|
||||||
|
"success" : true
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def scrape_domain(self):
|
def scrape_domain(self):
|
||||||
try:
|
try:
|
||||||
title = self.soup.find('title')
|
content = self.soup.find("div", class_="article")
|
||||||
|
title = content.find("h2")
|
||||||
|
article_text = content.find("section", class_="text")
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
"title" : title.text.strip
|
"title" : title.text.strip(),
|
||||||
|
"text": article_text.text.strip(),
|
||||||
"success" : True
|
"success" : True
|
||||||
}
|
}
|
||||||
return json.dumps(data, indent=4, ensure_ascii=False)
|
return json.dumps(data, indent=4, ensure_ascii=False)
|
||||||
|
|
Loading…
Reference in New Issue