using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Web;
using PhilExampleCrawler.Common.Models;

namespace PhilExampleCrawler.Core
{
    /// <summary>
    /// Loads a search result page with HtmlAgilityPack, extracts the insertion nodes from it and
    /// raises <see cref="OnNewInsertionFound"/> for every insertion whose <c>data-href</c> has not
    /// been seen in an earlier crawl.
    /// </summary>
    internal class BaseCrawler_HAP
    {
        #region HTML Tags & Selectors
        // Reflects found insertions on the search result page.
        const string SELECTOR_INSERTION = "#srchrslt-adtable > .ad-listitem > article";
        internal const string SELECTOR_TOPAD = ".aditem-main .aditem-main--top .aditem-main--top--right i.icon-feature-topad";
        internal const string SELECTOR_HIGHLIGHT = ".aditem-main .aditem-main--top .aditem-main--top--right i.icon-feature-highlight";
        internal const string SELECTOR_BOTTOM_TAG = ".aditem-main .aditem-main--bottom p .simpletag.tag-small";
        internal const string SELECTOR_LOCATION = ".aditem-main .aditem-main--top .aditem-main--top--left";
        internal const string ATTR_DATA_HREF = "data-href";
        internal const string BOTTOM_TAG_GESUCH = "GESUCH";
        #endregion

        readonly HtmlWeb _web = new();

        // data-href values of every insertion seen since the last StopCrawling().
        List<string> _cachedHrefs = new();

        // True until the first Crawl() after construction / StopCrawling(); that crawl only seeds the cache.
        bool _firstCrawl = true;

        // Only ever reset in StopCrawling(); nothing in this class reads it yet.
        bool _crawling = false;

        /// <summary>
        /// Raised once for each insertion whose href was not yet cached. The event argument is the
        /// result of <see cref="GetInsertion"/>, which currently always returns null.
        /// </summary>
        internal event EventHandler<Insertion> OnNewInsertionFound;

        //TODO: crawl categories
        //TODO: crawl locations

        /// <summary>
        /// Not implemented yet: the body is empty, so calling this currently does nothing.
        /// </summary>
        /// <param name="searchParams">Search parameters (unused for now).</param>
        /// <param name="intervalSec">Interval in seconds (unused for now).</param>
        internal void StartCrawling(CrawlSearchParams searchParams, int intervalSec = 10)
        {
        }

        /// <summary>
        /// Performs a single crawl: builds the search URL, loads the result page and compares the
        /// insertions found on it against the cached hrefs.
        /// </summary>
        /// <param name="searchParams">Parameters the search URL is built from.</param>
        /// <param name="timeout_ms">Value passed through to the page load call.</param>
        internal void Crawl(CrawlSearchParams searchParams, int timeout_ms)
        {
            var searchUrl = AsUrl(searchParams);
            var searchResultDoc = _web.Load(searchUrl, timeout_ms);
            if (searchResultDoc == null)
            {
                return;
            }

            var insNodes = GetInsertionNodes(searchResultDoc);
            CompareToFoundNodes(insNodes);
        }

        //TODO: Implement CancellationToken

        /// <summary>
        /// Resets the crawler state: clears the href cache and makes the next crawl a "first crawl" again.
        /// </summary>
        internal void StopCrawling()
        {
            _crawling = false;
            _firstCrawl = true;
            _cachedHrefs = new List<string>();
        }

        /// <summary>
        /// On the first crawl, caches every href without raising events. On later crawls, caches each
        /// href that is not cached yet and raises <see cref="OnNewInsertionFound"/> for it.
        /// </summary>
        /// <param name="insertionNodes">Insertion nodes carrying a <c>data-href</c> attribute.</param>
        private void CompareToFoundNodes(IList<HtmlNode> insertionNodes)
        {
            List<(string href, HtmlNode srcNode)> insHrefNodes = GetInsertionHrefs(insertionNodes);

            if (_firstCrawl)
            {
                // The first result page only establishes the baseline; nothing on it counts as "new".
                _cachedHrefs.AddRange(insHrefNodes.Select(x => x.href));
                _firstCrawl = false;
                return;
            }

            foreach ((string href, HtmlNode srcNode) in insHrefNodes)
            {
                if (_cachedHrefs.Contains(href))
                {
                    continue;
                }

                _cachedHrefs.Add(href);
                OnNewInsertionFound?.Invoke(this, GetInsertion(srcNode));
            }
        }

        #region Node Methods

        /// <summary>
        /// Selects the insertion nodes of a search result document, keeping only non-null nodes that
        /// have a <c>data-href</c> attribute.
        /// </summary>
        /// <param name="doc">The loaded search result document.</param>
        /// <returns>The filtered insertion nodes.</returns>
        private static IList<HtmlNode> GetInsertionNodes(HtmlDocument doc)
        {
            IList<HtmlNode> insNodes = doc.QuerySelectorAll(SELECTOR_INSERTION);
            return insNodes.Where(n => n != null
                                       && n.HasAttributes
                                       && n.Attributes.Any(a => a.Name == ATTR_DATA_HREF))
                           .ToList();
        }

        /// <summary>
        /// Pairs every insertion node with the value of its <c>data-href</c> attribute.
        /// </summary>
        /// <param name="insertionNodes">Nodes that are expected to carry the attribute.</param>
        /// <returns>One (href, node) tuple per input node, in input order.</returns>
        private static List<(string href, HtmlNode srcNode)> GetInsertionHrefs(IList<HtmlNode> insertionNodes)
            => insertionNodes.Select(x => (x.Attributes[ATTR_DATA_HREF].Value, x)).ToList();

        /// <summary>
        /// Intended to build an <see cref="Insertion"/> from an insertion node. The mapping below is
        /// still commented out, so this currently always returns null.
        /// </summary>
        /// <param name="insertionNode">The insertion node to convert.</param>
        /// <returns>Always null for now.</returns>
        private static Insertion GetInsertion(HtmlNode insertionNode)
        {
            //var i = new Insertion
            //{
            //    Href = insertionNode.Attributes["data-href"].Value,
            //    IsTopAd = insertionNode.QuerySelector(SELECTOR_TOPAD) != null,
            //    IsHighlight = insertionNode.QuerySelector(SELECTOR_HIGHLIGHT) != null,
            //};

            //var plzLoc = GetInsertionLocation(insertionNode);
            //Console.WriteLine("Plz: " + plzLoc.plz + " --- " + "Loc: " + plzLoc.loc);

            //var reqNode = insertionNode.QuerySelector(SELECTOR_BOTTOM_TAG);
            //if (reqNode != null)
            //    i.IsRequest = reqNode.InnerHtml?.ToUpperInvariant() == BOTTOM_TAG_GESUCH;

            //return i;
            return null;
        }

        /// <summary>
        /// Extracts a five-digit postal code and the text following it from the location element of
        /// an insertion node.
        /// </summary>
        /// <param name="insertionNode">The insertion node to inspect.</param>
        /// <returns>
        /// The parsed postal code and the trimmed remainder of the text, or <c>(-1, null)</c> when the
        /// location element is missing, empty, shorter than five characters or does not start with
        /// a parsable number.
        /// </returns>
        private static (int plz, string loc) GetInsertionLocation(HtmlNode insertionNode)
        {
            var locNode = insertionNode.QuerySelector(SELECTOR_LOCATION);
            if (locNode == null)
            {
                // Guard before touching InnerHtml: a missing location element must not throw.
                return (-1, null);
            }

            string inner = locNode.InnerHtml;
            if (string.IsNullOrEmpty(inner))
            {
                return (-1, null);
            }

            // Keep only the text after the last '>' so that any preceding markup is dropped.
            int lastTagEnd = inner.LastIndexOf('>');
            if (lastTagEnd > -1)
            {
                inner = inner.Substring(lastTagEnd + 1).Trim();
            }

            // The code is returned as int, so leading zeros of the five characters are not preserved.
            if (inner.Length >= 5 && int.TryParse(inner.Substring(0, 5), out int plz))
            {
                return (plz, inner.Substring(5).Trim());
            }

            return (-1, null);
        }

        // Not referenced anywhere in this class at the moment.
        const string nums = "0123456789";

        /// <summary>
        /// Converts every insertion node with <see cref="GetInsertion"/>.
        /// </summary>
        /// <param name="insertionNodes">The nodes to convert.</param>
        /// <returns>One entry per input node, in input order.</returns>
        private static List<Insertion> GetInsertions(IList<HtmlNode> insertionNodes)
            => insertionNodes.Select(x => GetInsertion(x)).ToList();

        #endregion

        /// <summary>
        /// Builds the search URL by filling the configured URL format with the URL-encoded keywords
        /// and location string, followed by the category id and radius.
        /// </summary>
        /// <param name="searchParams">The search parameters to encode.</param>
        /// <returns>The formatted search URL.</returns>
        private string AsUrl(CrawlSearchParams searchParams)
        {
            return string.Format(Config.EXAMPLE_SEARCH_URL,
                                 HttpUtility.UrlEncode(searchParams.KeyWords),
                                 HttpUtility.UrlEncode(searchParams.LocationStr),
                                 searchParams.CategoryID,
                                 searchParams.Radius);
        }
    }
}