-
-
Notifications
You must be signed in to change notification settings - Fork 79
Closed
Labels
bug: Something isn't working
Description
Describe the bug
If I try to scrape the following URL, it is successful:
https://www.werkenbijzicht.nl/vacatures.html
If I try to scrape the same page, but with another URL, it returns no result:
https://www.werkenbijzicht.nl/p/1/vacatures.html
It is the same page, but initiated from the pager at the bottom.
So with the original URL, my scraper only scrapes the first 10 items, and when I try to go to the next page, I get the new URL stated above, which returns nothing.
I cannot see what the difference is or how I should fix this?
- core: 3.0.0
class ZichtSpider extends BasicSpider
{
// Spider configuration. The framework semantics of these properties are
// defined by the BasicSpider parent, which is not visible in this snippet.

// Single start URL: the first page of the vacancy listing.
public array $startUrls = [
'https://www.werkenbijzicht.nl/vacatures.html',
];
// Middleware classes registered for the downloader. The last entry is a
// [class, options] pair that passes a fixed 'userAgent' option.
// NOTE(review): presumably applied in the listed order — confirm against the framework docs.
public array $downloaderMiddleware = [
RequestDeduplicationMiddleware::class,
ExecuteJavascriptMiddleware::class,
[RandomUserAgentMiddleware::class, ['userAgent' => 'Mozilla/5.0 (compatible; RoachPHP/0.1.0)']],
];
// No spider middleware is registered.
public array $spiderMiddleware = [
//
];
// No item processors are registered.
public array $itemProcessors = [
//
];
// Extensions enabled for this spider: logging and stats collection (per the class names).
public array $extensions = [
LoggerExtension::class,
StatsCollectorExtension::class,
];
// NOTE(review): presumably the maximum number of requests sent in parallel — confirm.
public int $concurrency = 2;
// NOTE(review): presumably the delay between request batches, in seconds — confirm the unit.
public int $requestDelay = 1;
/**
 * Parses a vacancy listing page.
 *
 * For every `.actSResContainer .itemContainer` element that contains a title
 * link, yields a GET request for the vacancy detail page (handled by
 * `parseJob`) and passes the collected listing data along as the `item`
 * option. If the page has a "next" pager link (`.pageNav a.pnNext`), also
 * yields a GET request for the following listing page.
 *
 * @param Response $response The downloaded listing page.
 *
 * @return Generator Yields the requests created via `$this->request()`.
 */
public function parse(Response $response): Generator
{
    $items = $response
        ->filter('.actSResContainer .itemContainer')
        ->each(static function (Crawler $node): ?array {
            $titleNode = $node->filter('.itemTitle a.cluetips');

            // attr(), link() and text() throw on an empty node list, so a
            // single container without a title link would abort the whole
            // page. Skip such containers instead.
            if ($titleNode->count() === 0) {
                return null;
            }

            // attr() returns null when the attribute is absent; preg_match()
            // must not be handed null.
            $relAttribute = $titleNode->attr('rel') ?? '';

            // The vacancy ID is the numeric path segment following "id/";
            // fall back to an empty string when the pattern does not match.
            $matches = [];
            $vacancyId = preg_match('/id\/(\d+)\//', $relAttribute, $matches) === 1
                ? $matches[1]
                : '';

            return [
                'url' => $titleNode->link()->getUri(),
                'title' => $titleNode->text(),
                'referenceNumber' => $vacancyId,
            ];
        });

    foreach ($items as $item) {
        if ($item === null) {
            continue;
        }

        yield $this->request('GET', $item['url'], 'parseJob', ['item' => $item]);
    }

    // Follow the pager. The count() guard covers the "no next page" case, so
    // no blanket try/catch is needed; unexpected failures now surface instead
    // of being silently discarded.
    $nextPageLink = $response->filter('.pageNav a.pnNext');

    if ($nextPageLink->count() > 0) {
        // Resolve the href against the page URI (same approach as for the
        // item links above) rather than prefixing a hard-coded host, which
        // produces a broken URL for absolute or document-relative hrefs.
        yield $this->request('GET', $nextPageLink->link()->getUri());
    }
}
public function parseJob(Response $response): Generator
{Metadata
Metadata
Assignees
Labels
bug: Something isn't working