You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
158 lines
5.5 KiB
158 lines
5.5 KiB
using HtmlAgilityPack;
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Net;
|
|
using System.Text;
|
|
using System.Threading.Tasks;
|
|
using System.Web;
|
|
using PhilExampleCrawler.Common.Models;
|
|
|
|
namespace PhilExampleCrawler.Core
|
|
{
|
|
/// <summary>
/// Loads a search-result page with HtmlAgilityPack, extracts the insertion nodes from it and
/// raises <see cref="OnNewInsertionFound"/> for every insertion whose <c>data-href</c> has not
/// been seen since the last call to <see cref="StopCrawling"/>.
/// </summary>
internal class BaseCrawler_HAP
{
    #region HTML Tags & Selectors
    // Reflects found insertions on the search result page.
    const string SELECTOR_INSERTION = "#srchrslt-adtable > .ad-listitem > article";

    internal const string SELECTOR_TOPAD = ".aditem-main .aditem-main--top .aditem-main--top--right i.icon-feature-topad";
    internal const string SELECTOR_HIGHLIGHT = ".aditem-main .aditem-main--top .aditem-main--top--right i.icon-feature-highlight";
    internal const string SELECTOR_BOTTOM_TAG = ".aditem-main .aditem-main--bottom p .simpletag.tag-small";

    internal const string SELECTOR_LOCATION = ".aditem-main .aditem-main--top .aditem-main--top--left";

    // Attribute an insertion node must carry; its value is used as the insertion's identity.
    internal const string ATTR_DATA_HREF = "data-href";

    internal const string BOTTOM_TAG_GESUCH = "GESUCH";
    #endregion

    readonly HtmlWeb _web = new();

    // Hrefs already seen since the last StopCrawling(). A set keeps the per-node
    // "already known?" check O(1) instead of scanning a list for every node.
    readonly HashSet<string> _cachedHrefs = new();

    // True until the first successful Crawl(); that crawl only seeds the cache.
    bool _firstCrawl = true;

    // Only ever reset in StopCrawling(); nothing in this class sets it to true yet.
    bool _crawling = false;

    /// <summary>
    /// Raised by <see cref="Crawl"/> once per insertion node whose href was not cached yet.
    /// NOTE: the event argument comes from <c>GetInsertion</c>, which currently always
    /// returns <c>null</c>, so handlers must be prepared for a <c>null</c> argument.
    /// </summary>
    internal event EventHandler<Insertion> OnNewInsertionFound;

    //TODO: crawl categories
    //TODO: crawl locations

    /// <summary>
    /// Placeholder for a periodic crawl. The body is intentionally empty: neither
    /// <paramref name="searchParams"/> nor <paramref name="intervalSec"/> is used yet.
    /// </summary>
    /// <param name="searchParams">Search parameters (currently unused).</param>
    /// <param name="intervalSec">Interval in seconds (currently unused).</param>
    internal void StartCrawling(CrawlSearchParams searchParams, int intervalSec = 10)
    {
        // Not implemented yet.
    }

    /// <summary>
    /// Performs a single crawl: builds the search URL, loads the result page and compares the
    /// insertion nodes found on it against the cached hrefs.
    /// </summary>
    /// <param name="searchParams">Parameters the search URL is built from.</param>
    /// <param name="timeout_ms">Value passed through to <c>_web.Load</c> together with the URL.</param>
    internal void Crawl(CrawlSearchParams searchParams, int timeout_ms)
    {
        var searchUrl = AsUrl(searchParams);
        var searchResultDoc = _web.Load(searchUrl, timeout_ms);

        if (searchResultDoc == null)
        {
            return;
        }

        var insNodes = GetInsertionNodes(searchResultDoc);
        CompareToFoundNodes(insNodes);
    }

    //TODO: Implement CancellationToken
    /// <summary>
    /// Resets the crawler state: clears the href cache and makes the next
    /// <see cref="Crawl"/> behave like a first crawl again.
    /// </summary>
    internal void StopCrawling()
    {
        _crawling = false;
        _firstCrawl = true;
        _cachedHrefs.Clear();
    }

    /// <summary>
    /// Compares the hrefs of <paramref name="insertionNodes"/> with the cache. On the first
    /// crawl the hrefs are only cached; afterwards every uncached href is cached and reported
    /// through <see cref="OnNewInsertionFound"/>.
    /// </summary>
    /// <param name="insertionNodes">Insertion nodes that all carry the <c>data-href</c> attribute.</param>
    private void CompareToFoundNodes(IList<HtmlNode> insertionNodes)
    {
        List<(string href, HtmlNode srcNode)> insHrefNodes = GetInsertionHrefs(insertionNodes);

        if (_firstCrawl)
        {
            // The first crawl establishes the baseline; nothing is reported as "new".
            _cachedHrefs.UnionWith(insHrefNodes.Select(x => x.href));
            _firstCrawl = false;
            return;
        }

        foreach ((string href, HtmlNode srcNode) in insHrefNodes)
        {
            // Add() returns false for known hrefs, so the href is cached before the
            // event fires and each href is reported at most once.
            if (_cachedHrefs.Add(href))
            {
                OnNewInsertionFound?.Invoke(this, GetInsertion(srcNode));
            }
        }
    }

    #region Node Methods
    /// <summary>
    /// Selects all nodes matching <c>SELECTOR_INSERTION</c> and keeps those that have a
    /// <c>data-href</c> attribute.
    /// </summary>
    /// <param name="doc">Loaded search-result document.</param>
    /// <returns>The non-null insertion nodes carrying the <c>data-href</c> attribute.</returns>
    private static IList<HtmlNode> GetInsertionNodes(HtmlDocument doc)
    {
        IList<HtmlNode> insNodes = doc.QuerySelectorAll(SELECTOR_INSERTION);
        return insNodes.Where(n => n != null
                                && n.HasAttributes
                                && n.Attributes.Any(a => a.Name == ATTR_DATA_HREF)).ToList();
    }

    /// <summary>
    /// Pairs every node with the value of its <c>data-href</c> attribute. Each node is
    /// expected to have that attribute (see <c>GetInsertionNodes</c>).
    /// </summary>
    private static List<(string href, HtmlNode srcNode)> GetInsertionHrefs(IList<HtmlNode> insertionNodes)
        => insertionNodes.Select(x => (x.Attributes[ATTR_DATA_HREF].Value, x)).ToList();

    /// <summary>
    /// Intended to map an insertion node to an <see cref="Insertion"/>. The mapping is
    /// currently disabled (kept below as a reference), so this always returns <c>null</c>.
    /// </summary>
    private static Insertion GetInsertion(HtmlNode insertionNode)
    {
        //TODO: re-enable the mapping below
        //var i = new Insertion
        //{
        //    Href = insertionNode.Attributes["data-href"].Value,
        //    IsTopAd = insertionNode.QuerySelector(SELECTOR_TOPAD) != null,
        //    IsHighlight = insertionNode.QuerySelector(SELECTOR_HIGHLIGHT) != null,
        //};

        //var plzLoc = GetInsertionLocation(insertionNode);
        //Console.WriteLine("Plz: " + plzLoc.plz + " --- " + "Loc: " + plzLoc.loc);

        //var reqNode = insertionNode.QuerySelector(SELECTOR_BOTTOM_TAG);
        //if (reqNode != null)
        //    i.IsRequest = reqNode.InnerHtml?.ToUpperInvariant() == BOTTOM_TAG_GESUCH;

        //return i;
        return null;
    }

    /// <summary>
    /// Extracts a 5-character numeric prefix and the remaining text from the element matched by
    /// <c>SELECTOR_LOCATION</c>. Only the text after the last <c>&gt;</c> of the element's
    /// inner HTML is considered.
    /// </summary>
    /// <param name="insertionNode">Insertion node to search in.</param>
    /// <returns>
    /// The parsed number and the trimmed rest of the text, or <c>(-1, null)</c> when the
    /// element is missing, empty, shorter than 5 characters or does not start with a number.
    /// </returns>
    private static (int plz, string loc) GetInsertionLocation(HtmlNode insertionNode)
    {
        var locNode = insertionNode.QuerySelector(SELECTOR_LOCATION);

        // The location element may be absent; read its content null-safely.
        string inner = locNode?.InnerHtml;
        if (string.IsNullOrEmpty(inner))
        {
            return (-1, null);
        }

        // Drop any leading markup: keep only what follows the last tag end.
        int lastTagEnd = inner.LastIndexOf('>');
        if (lastTagEnd > -1)
        {
            inner = inner.Substring(lastTagEnd + 1);
        }

        // Always trim so leading whitespace cannot shift the 5-character window.
        inner = inner.Trim();

        if (inner.Length >= 5 && int.TryParse(inner.Substring(0, 5), out int plz))
        {
            return (plz, inner.Substring(5).Trim());
        }

        return (-1, null);
    }

    // Currently not referenced anywhere in this class.
    const string nums = "0123456789";

    /// <summary>Maps every node through <c>GetInsertion</c>.</summary>
    private static List<Insertion> GetInsertions(IList<HtmlNode> insertionNodes)
        => insertionNodes.Select(x => GetInsertion(x)).ToList();
    #endregion

    /// <summary>
    /// Builds the search URL by filling <c>Config.EXAMPLE_SEARCH_URL</c> with the URL-encoded
    /// keywords and location string, the category id and the radius (in that order).
    /// </summary>
    /// <param name="searchParams">Source of the URL placeholders' values.</param>
    private static string AsUrl(CrawlSearchParams searchParams)
    {
        // Invariant culture: a URL is machine-readable, so values must not be
        // formatted according to the current UI culture.
        return string.Format(System.Globalization.CultureInfo.InvariantCulture,
                             Config.EXAMPLE_SEARCH_URL,
                             HttpUtility.UrlEncode(searchParams.KeyWords),
                             HttpUtility.UrlEncode(searchParams.LocationStr),
                             searchParams.CategoryID,
                             searchParams.Radius);
    }
}
|
|
}
|