Remove same_domain
spider2048 committed Feb 18, 2024
1 parent f6411f9 commit a3833ef
Showing 4 changed files with 32 additions and 20 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -26,4 +26,10 @@ The below snippet is used for defining a profile for the crawler.
 same_domain = true # Todo: regex match
 ```
 
-Refer to the `config.toml` file for more example usages.
+Refer to the `config.toml` file for more example usages.
+
+## TODO
+
+- Add Graph frontend
+- Add Indexing
+- Add search engine
11 changes: 8 additions & 3 deletions crawl.toml
@@ -20,14 +20,19 @@ graph_dir = './graphs'
 locations = [
     'https://scrapeme.live/shop/'
 ]
+
 depth = 3
-same_domain = true
+
 filter = [
     '^https://scrapeme.live/page/.*',
     '^http://scrapeme.live/page/.*',
     '^https://scrapeme.live/shop/.*/?add-to-cart=.*',
-    '.*\.png$'
+    '.*\.png$',
+    '.*\.jpg$',
+    '^https://scrapeme.live/product-tag/.*',
+    '^https://scrapeme.live/product-category/.*',
 ]
+
 match = [
-    '^http://scrapeme.live/shop/.*'
+    '^https://scrapeme.live/*'
 ]
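
For orientation, here is a minimal sketch of how a profile like the one above could be loaded and its `filter`/`match` patterns pre-compiled. It assumes Python 3.11's `tomllib` and a made-up top-level table name (`shop`) and helper (`load_profile`); the crawler's own loader may be organized differently.

```python
import re
import tomllib  # Python 3.11+; older interpreters would need the third-party 'toml' package


def load_profile(path: str, name: str) -> dict:
    """Read one crawler profile from a TOML file and pre-compile its regex lists."""
    with open(path, "rb") as fh:
        config = tomllib.load(fh)

    profile = config[name]  # assumes each profile is a top-level TOML table
    return {
        "locations": profile["locations"],
        "depth": profile["depth"],
        # Deny-list: URLs matching any of these patterns are skipped.
        "filter": [re.compile(p) for p in profile.get("filter", [])],
        # Allow-list: URLs should match one of these patterns to be crawled.
        "match": [re.compile(p) for p in profile.get("match", [])],
    }


# Hypothetical usage:
# opts = load_profile("crawl.toml", "shop")
```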
9 changes: 8 additions & 1 deletion crawler/__main__.py
@@ -7,6 +7,7 @@
 from worker import Crawler
 from models import *
 
+LOGGING_FORMAT = "[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s"
 
 def main(args):
     logging.info("Crawler running in: %s", os.getcwd())
@@ -23,9 +24,15 @@ def main(args):
             filename=crawlopts.log_file,
             level=logging.DEBUG,
             force=True,
+            format=LOGGING_FORMAT,
         )
     else:
-        logging.basicConfig(filename=crawlopts.log_file, level=logging.INFO, force=True)
+        logging.basicConfig(
+            filename=crawlopts.log_file,
+            level=logging.INFO,
+            force=True,
+            format=LOGGING_FORMAT,
+        )
 
     logger = logging.getLogger("CrawlerMain")
     logger.info("Preparing to crawl ...")
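The change above applies the new `LOGGING_FORMAT` in both the debug and non-debug branches. As a rough sketch (not the project's exact code), the two `basicConfig` calls could also be collapsed, since only the level differs:

```python
import logging

LOGGING_FORMAT = "[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s"


def configure_logging(log_file: str, debug: bool) -> None:
    # One call covers both branches: the format string is shared, only the level changes.
    logging.basicConfig(
        filename=log_file,
        level=logging.DEBUG if debug else logging.INFO,
        force=True,  # replace any handlers configured earlier
        format=LOGGING_FORMAT,
    )
```
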
24 changes: 9 additions & 15 deletions models/__init__.py
@@ -5,7 +5,7 @@
 import os
 import time
 import datetime
-from typing import Coroutine, List, Set
+from typing import Coroutine, List
 import re
 
 Base = declarative_base()
@@ -60,10 +60,7 @@ def __init__(self, profile_name, profile):
         self.profile_name: str = profile_name
         self.locations: List[str] = profile["locations"]
         self.depth: int = profile["depth"]
-        self.same_domain: bool = profile["same_domain"]
-
-        self.domain: str = urlparse(self.locations[0]).netloc
-
+
         self.filters: List[re.Pattern] = [
             re.compile(pattern) for pattern in profile["filter"]
         ]
@@ -76,16 +73,13 @@ def __init__(self, profile_name, profile):
         self.tasks: List[Coroutine] = []
 
     def filter(self, url: str) -> bool:
-        for filter in self.filters:
-            if re.search(filter, url):
+        if self.filters:
+            if any(re.search(filter, url) for filter in self.filters):
                 return False
 
-        for matchp in self.matches:
-            if re.search(matchp, url):
+        if self.matches:
+            if any(re.search(matchp, url) for matchp in self.matches):
                 return True
 
-
-        if self.same_domain and urlparse(url).netloc != self.domain:
-            return False
-
-        return True
+        else:
+            return True
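
For reference, the precedence the rewritten `filter()` appears to implement is: deny-list (`filter`) first, then allow-list (`match`), with an empty allow-list accepting everything. A standalone sketch with made-up patterns (not the repository's code or tests):

```python
import re

filters = [re.compile(r".*\.png$")]                       # deny-list
matches = [re.compile(r"^https://scrapeme\.live/shop/")]  # allow-list


def accept(url: str) -> bool:
    if filters and any(f.search(url) for f in filters):
        return False  # explicitly excluded
    if matches:
        return any(m.search(url) for m in matches)  # must hit the allow-list
    return True  # no allow-list configured: accept by default


print(accept("https://scrapeme.live/shop/pikachu/"))  # True
print(accept("https://scrapeme.live/logo.png"))        # False
```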
