using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Web;
using PhilExampleCrawler.Common.Models;

namespace PhilExampleCrawler.Core
{
    /// <summary>
    /// Polls a single search result page and reports insertions (ads) that were not
    /// present on any previously crawled page of the same session.
    /// </summary>
    /// <remarks>
    /// The first successful crawl only records the hrefs it sees; later crawls return
    /// the insertions whose href is new and which pass <see cref="Validate"/>.
    /// The href cache grows for the lifetime of the instance.
    /// </remarks>
    public class BaseCrawler_Best
    {
        #region HTML Tags & Selectors
        const string SELECTOR_INSERTION = "#srchrslt-adtable > .ad-listitem > article"; //reflects found insertions on the search result page

        internal const string SELECTOR_TOPAD = ".aditem-main .aditem-main--top .aditem-main--top--right i.icon-feature-topad";
        internal const string SELECTOR_HIGHLIGHT = ".aditem-main .aditem-main--top .aditem-main--top--right i.icon-feature-highlight";
        internal const string SELECTOR_BOTTOM_TAG = ".aditem-main .aditem-main--bottom p .simpletag.tag-small";
        internal const string SELECTOR_NAME = ".aditem-main .aditem-main--middle .text-module-begin";
        internal const string SELECTOR_DATE = ".aditem-main .aditem-main--top .aditem-main--top--right";
        internal const string SELECTOR_LOCATION = ".aditem-main .aditem-main--top .aditem-main--top--left";
        internal const string SELECTOR_PRICE = ".aditem-main .aditem-main--middle .aditem-main--middle--price-shipping .aditem-main--middle--price-shipping--price";

        internal const string ATTR_DATA_HREF = "data-href";
        internal const string BOTTOM_TAG_GESUCH = "GESUCH";
        #endregion

        // Characters kept by ReduceToNumeric: digits plus both separator characters.
        const string ALLOWED_NUMCHARS = "0123456789,.";

        // Prices are parsed with German number formatting; cached instead of rebuilt per insertion.
        private static readonly CultureInfo GermanCulture = new CultureInfo("de-DE");

        // Matches any run of whitespace; used to normalize the location text.
        private static readonly Regex WhitespaceRun = new Regex(@"\s+", RegexOptions.Compiled);

        private readonly CrawlSession _sess;
        private readonly HttpClient _httpClient;

        // Hrefs seen so far. A set gives O(1) membership checks instead of scanning a list per node.
        private readonly HashSet<string> _cachedHrefs = new(StringComparer.Ordinal);

        private bool _firstCrawl = true;
        private string _searchUrl;

        /// <summary>Creates a crawler for the given session.</summary>
        /// <param name="crawlSession">Session whose search parameters and price limits are used.</param>
        /// <param name="client">HTTP client used for the page requests; it is not disposed by this class.</param>
        /// <exception cref="ArgumentNullException">An argument is null.</exception>
        public BaseCrawler_Best(CrawlSession crawlSession, HttpClient client)
        {
            _sess = crawlSession ?? throw new ArgumentNullException(nameof(crawlSession));
            _httpClient = client ?? throw new ArgumentNullException(nameof(client));
            _searchUrl = AsUrl(crawlSession);
        }

        /// <summary>
        /// Downloads the search result page and returns the insertions that are new since
        /// the previous calls.
        /// </summary>
        /// <returns>
        /// The new, validated insertions (empty on the first successful crawl), or
        /// <c>null</c> when the response body is empty or an exception occurred.
        /// </returns>
        // NOTE(review): return type assumed to be Task<List<Insertion>> - confirm against callers.
        public async Task<List<Insertion>> CrawlAsync()
        {
            try
            {
                // Dispose the response so its content stream and connection are released promptly.
                using var response = await _httpClient.GetAsync(_searchUrl);
                var htmlString = await response.Content.ReadAsStringAsync();

                if (string.IsNullOrEmpty(htmlString))
                {
                    // If the request ended up at a different URI, use that URI for the next crawl.
                    var finalUrl = response.RequestMessage?.RequestUri?.AbsoluteUri;
                    if (finalUrl != null && finalUrl != _searchUrl)
                    {
                        _searchUrl = finalUrl;
                    }

                    return null; //TODO: LOG ERROR
                }

                var htmlDoc = new HtmlDocument();
                htmlDoc.LoadHtml(htmlString);

                return CompareToFoundNodes(GetInsertionNodes(htmlDoc));
            }
            catch (Exception ex)
            {
                Console.WriteLine("TODO: ERROR! " + ex);
                return null;
            }
        }

        /// <summary>
        /// Returns newly found insertions after the first call to this method.
        /// The first call only fills the href cache and returns an empty list.
        /// </summary>
        private List<Insertion> CompareToFoundNodes(IList<HtmlNode> insertionNodes)
        {
            List<(string href, HtmlNode srcNode)> insHrefNodes = GetInsertionHrefs(insertionNodes);
            List<Insertion> insertions = new();

            if (_firstCrawl)
            {
                _cachedHrefs.UnionWith(insHrefNodes.Select(x => x.href));
                _firstCrawl = false;
                return insertions;
            }

            foreach ((string href, HtmlNode srcNode) in insHrefNodes)
            {
                // Add returns false when the href is already cached, i.e. the insertion is not new.
                if (!_cachedHrefs.Add(href))
                {
                    continue;
                }

                var insertion = GetInsertion(srcNode, _sess.ID);
                if (Validate(insertion))
                {
                    insertions.Add(insertion);
                }
            }

            return insertions;
        }

        /// <summary>
        /// Filters out requests, top ads, highlights and insertions outside the session's price range.
        /// </summary>
        private bool Validate(Insertion i)
        {
            if (i.IsRequest || i.IsTopAd || i.IsHighlight)
            {
                return false;
            }

            if (i.Price < _sess.MinPrice)
            {
                return false;
            }

            // MaxPrice <= 0 means "no upper bound" (AsUrl leaves it out of the query in that case),
            // so the limit must only be enforced when it is actually set.
            if (_sess.MaxPrice > 0 && i.Price > _sess.MaxPrice)
            {
                return false;
            }

            return true;
        }

        #region Node Methods
        /// <summary>Selects the insertion nodes of the page that carry a data-href attribute.</summary>
        private static IList<HtmlNode> GetInsertionNodes(HtmlDocument doc)
        {
            IList<HtmlNode> insNodes = doc.QuerySelectorAll(SELECTOR_INSERTION);
            return insNodes.Where(n => n != null
                                    && n.HasAttributes
                                    && n.Attributes.Any(a => a.Name == ATTR_DATA_HREF))
                           .ToList();
        }

        /// <summary>Pairs every insertion node with the value of its data-href attribute.</summary>
        private static List<(string href, HtmlNode srcNode)> GetInsertionHrefs(IList<HtmlNode> insertionNodes)
            => insertionNodes.Select(x => (x.Attributes[ATTR_DATA_HREF].Value, x)).ToList();

        /// <summary>Builds an <see cref="Insertion"/> from the values found below an insertion node.</summary>
        private static Insertion GetInsertion(HtmlNode insertionNode, int sessionID)
        {
            var (price, is_vb) = GetPriceVB(insertionNode, SELECTOR_PRICE);
            var (pC, loc) = GetPostCodeLocation(insertionNode);

            return new Insertion(href: insertionNode.Attributes[ATTR_DATA_HREF].Value,
                                 crawlSessionID: sessionID,
                                 name: GetInnerText(insertionNode, SELECTOR_NAME),
                                 postCode: pC,
                                 locationStr: loc,
                                 price: price,
                                 is_vb: is_vb,
                                 date: GetDate(insertionNode, SELECTOR_DATE),
                                 isTopAd: insertionNode.QuerySelector(SELECTOR_TOPAD) != null,
                                 isHighlight: insertionNode.QuerySelector(SELECTOR_HIGHLIGHT) != null,
                                 isRequest: string.Equals(GetInnerText(insertionNode, SELECTOR_BOTTOM_TAG),
                                                          BOTTOM_TAG_GESUCH,
                                                          StringComparison.OrdinalIgnoreCase));
        }

        /// <summary>
        /// Reads the price text and returns the parsed price together with a flag telling
        /// whether the text contains "VB". The price is 0 when no number can be parsed.
        /// </summary>
        private static (decimal price, bool is_vb) GetPriceVB(HtmlNode insertionNode, string selector)
        {
            string priceVB = GetInnerText(insertionNode, selector);
            bool is_vb = priceVB.Contains("VB");

            string priceStr = ReduceToNumeric(priceVB);
            if (!string.IsNullOrEmpty(priceStr)
                && decimal.TryParse(priceStr, NumberStyles.Number, GermanCulture, out decimal parsed))
            {
                return (parsed, is_vb);
            }

            return (0, is_vb);
        }

        /// <summary>
        /// Splits the location text into a leading five-character post code and the remaining
        /// location name. Returns (-1, null) when the text does not start with a number.
        /// </summary>
        private static (int postCode, string loc) GetPostCodeLocation(HtmlNode insertionNode)
        {
            string pcLoc = GetInnerText(insertionNode, SELECTOR_LOCATION);
            if (!string.IsNullOrEmpty(pcLoc))
            {
                // Collapse runs of whitespace into a single space.
                pcLoc = WhitespaceRun.Replace(pcLoc, " ");

                if (pcLoc.Length >= 5 && int.TryParse(pcLoc.Substring(0, 5), out int plz))
                {
                    return (plz, pcLoc.Substring(5).Trim());
                }
            }

            return (-1, null);
        }

        /// <summary>
        /// Returns the trimmed inner text (line feeds removed) of the first node matching
        /// <paramref name="selector"/>, or an empty string when nothing matches.
        /// </summary>
        private static string GetInnerText(HtmlNode insertionNode, string selector)
            => (insertionNode.QuerySelector(selector)?.InnerText ?? "")
                .Replace("\n", "").Trim();

        /// <summary>
        /// Parses the date text of an insertion. Known formats:
        /// "Heute, 09:02", "Gestern, 21:21" and "26.10.2022".
        /// </summary>
        /// <returns>The parsed local date/time, or <c>null</c> when the text matches no known format.</returns>
        private static DateTime? GetDate(HtmlNode insertionNode, string selector)
        {
            string dateText = GetInnerText(insertionNode, selector);
            if (string.IsNullOrEmpty(dateText))
            {
                return null;
            }

            const string separator = ", ";
            int sepaIndex = dateText.IndexOf(separator, StringComparison.Ordinal);

            // Without a separator the whole text is treated as the date part.
            string start = sepaIndex != -1 ? dateText.Substring(0, sepaIndex) : null;
            string end = sepaIndex != -1 ? dateText.Substring(sepaIndex + separator.Length) : dateText;

            if ((start == "Heute" || start == "Gestern")
                && TimeSpan.TryParseExact(end, "hh\\:mm", CultureInfo.InvariantCulture, out TimeSpan time))
            {
                // Read the clock once so both branches refer to the same day.
                DateTime today = DateTime.Today;
                DateTime day = start == "Heute" ? today : today.AddDays(-1);
                return day.Add(time);
            }

            // Invariant culture keeps the result independent of the machine's calendar settings.
            if (DateTime.TryParseExact(end, "dd.MM.yyyy", CultureInfo.InvariantCulture, DateTimeStyles.None, out DateTime date))
            {
                return date;
            }

            return null;
        }

        /// <summary>
        /// Returns <paramref name="s"/> with every character removed that is not a digit,
        /// a comma or a dot. Returns an empty string for null or empty input.
        /// </summary>
        private static string ReduceToNumeric(string s)
        {
            if (string.IsNullOrEmpty(s))
            {
                return "";
            }

            var numeric = new StringBuilder(s.Length);
            foreach (char c in s)
            {
                if (ALLOWED_NUMCHARS.IndexOf(c) >= 0)
                {
                    numeric.Append(c);
                }
            }

            return numeric.ToString();
        }
        #endregion

        /// <summary>
        /// Fills the configured search URL template with the session's search parameters.
        /// Price limits that are not positive are passed as empty strings.
        /// </summary>
        private static string AsUrl(CrawlSession cs)
            => string.Format(Config.EXAMPLE_SEARCH_URL,
                             HttpUtility.UrlEncode(cs.SearchParams.KeyWords),
                             HttpUtility.UrlEncode(cs.SearchParams.LocationStr),
                             cs.SearchParams.CategoryID,
                             cs.SearchParams.Radius,
                             cs.MinPrice <= 0 ? "" : cs.MinPrice,
                             cs.MaxPrice <= 0 ? "" : cs.MaxPrice,
                             GetPosterType(cs));

        /// <summary>
        /// Maps the session's poster flags to the value used in the search URL.
        /// Both flags set, or neither, yields an empty string.
        /// </summary>
        private static string GetPosterType(CrawlSession cs)
        {
            if (cs.IsPrivate == cs.IsCommercial)
            {
                return string.Empty;
            }

            return cs.IsPrivate ? "PRIVATE" : "COMMERCIAL";
        }
    }
}