1
0
Fork 0
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

248 lines
10 KiB

using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Web;
using PhilExampleCrawler.Common.Models;
namespace PhilExampleCrawler.Core
{
public class BaseCrawler_Best
{
#region HTML Tags & Selectors
// CSS selector passed to QuerySelectorAll in GetInsertionNodes; every match is a candidate insertion node.
const string SELECTOR_INSERTION = "#srchrslt-adtable > .ad-listitem > article"; //reflects found insertions on the search result page
// Relative to an insertion node: a match marks the insertion as a "top ad" (see GetInsertion).
internal const string SELECTOR_TOPAD = ".aditem-main .aditem-main--top .aditem-main--top--right i.icon-feature-topad";
// Relative to an insertion node: a match marks the insertion as highlighted (see GetInsertion).
internal const string SELECTOR_HIGHLIGHT = ".aditem-main .aditem-main--top .aditem-main--top--right i.icon-feature-highlight";
// Relative to an insertion node: the tag whose text is compared with BOTTOM_TAG_GESUCH.
internal const string SELECTOR_BOTTOM_TAG = ".aditem-main .aditem-main--bottom p .simpletag.tag-small";
// Relative to an insertion node: element whose inner text becomes the insertion name.
internal const string SELECTOR_NAME = ".aditem-main .aditem-main--middle .text-module-begin";
// Relative to an insertion node: element whose inner text is parsed by GetDate.
internal const string SELECTOR_DATE = ".aditem-main .aditem-main--top .aditem-main--top--right";
// Relative to an insertion node: element whose inner text is split into post code and location.
internal const string SELECTOR_LOCATION = ".aditem-main .aditem-main--top .aditem-main--top--left";
// Relative to an insertion node: element whose inner text is parsed by GetPriceVB.
internal const string SELECTOR_PRICE = ".aditem-main .aditem-main--middle .aditem-main--middle--price-shipping .aditem-main--middle--price-shipping--price";
// Attribute a node must carry to be accepted as an insertion; its value is used as the insertion's href.
internal const string ATTR_DATA_HREF = "data-href";
// Bottom-tag text (compared case-insensitively) that flags an insertion as a request.
internal const string BOTTOM_TAG_GESUCH = "GESUCH";
#endregion
// Session being crawled; supplies the session ID and the price limits used by Validate.
private readonly CrawlSession _sess;
// Client supplied by the caller; used for the GET request in CrawlAsync.
private readonly HttpClient _httpClient;
// Hrefs of every insertion seen so far; an href not in this list counts as new.
private readonly List<string> _cachedHrefs = new();
// True until CompareToFoundNodes has run once; that first run only fills _cachedHrefs.
private bool _firstCrawl = true;
// Search URL built from the session in the constructor; CrawlAsync may replace it with the
// final request URI when a response comes back with an empty body.
private string _searchUrl;
/// <summary> Creates a crawler for one crawl session and precomputes its search URL. </summary>
/// <param name="crawlSession"> Session whose search parameters, ID and price limits drive the crawl. </param>
/// <param name="client"> HTTP client used for all requests; it is not disposed by this class. </param>
/// <exception cref="ArgumentNullException"> <paramref name="crawlSession"/> or <paramref name="client"/> is null. </exception>
public BaseCrawler_Best(CrawlSession crawlSession, HttpClient client)
{
    // Fail fast: without these checks a null client only surfaces later as a
    // swallowed NullReferenceException inside CrawlAsync.
    _sess = crawlSession ?? throw new ArgumentNullException(nameof(crawlSession));
    _httpClient = client ?? throw new ArgumentNullException(nameof(client));
    _searchUrl = AsUrl(crawlSession);
}
/// <summary>
/// Downloads the search result page and returns the insertions that were not present
/// on any earlier call.
/// </summary>
/// <returns>
/// The new insertions that passed validation. The first successful call only records the
/// hrefs it sees and therefore returns an empty list. Returns <c>null</c> when the page
/// could not be fetched or processed.
/// </returns>
public async Task<List<Insertion>> CrawlAsync()
{
    try
    {
        // The response owns the content stream, so dispose it once the body has been read.
        using var response = await _httpClient.GetAsync(_searchUrl);
        var htmlString = await response.Content.ReadAsStringAsync();
        if (string.IsNullOrEmpty(htmlString))
        {
            // Empty body: remember the URI the request finally ended up at, if it differs.
            var finalUrl = response.RequestMessage?.RequestUri?.AbsoluteUri;
            if (finalUrl != null && finalUrl != _searchUrl)
                _searchUrl = finalUrl;
            return null; //TODO: LOG ERROR
        }

        // An error page contains no insertion nodes. Parsing it on the very first call
        // would end the "first crawl" phase with an empty cache, and the next good
        // response would then report every listing as new.
        if (!response.IsSuccessStatusCode)
        {
            Console.WriteLine("TODO: ERROR! Unexpected HTTP status " + (int)response.StatusCode);
            return null;
        }

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(htmlString);
        var insNodes = GetInsertionNodes(htmlDoc);
        return CompareToFoundNodes(insNodes);
    }
    catch (Exception ex)
    {
        // Best effort: any failure is reported and signalled to the caller as null.
        Console.WriteLine("TODO: ERROR! " + ex);
        return null;
    }
}
/// <summary>
/// Matches the given nodes against the hrefs already seen. The first call only records
/// the hrefs and returns an empty list; later calls return the validated insertions
/// whose href was not seen before.
/// </summary>
private List<Insertion> CompareToFoundNodes(IList<HtmlNode> insertionNodes)
{
    var candidates = GetInsertionHrefs(insertionNodes);
    var fresh = new List<Insertion>();

    if (_firstCrawl)
    {
        // Seed the cache; nothing is reported on the first pass.
        foreach (var candidate in candidates)
            _cachedHrefs.Add(candidate.href);
        _firstCrawl = false;
        return fresh;
    }

    foreach (var candidate in candidates)
    {
        if (_cachedHrefs.Contains(candidate.href))
            continue;

        // Remember the href even when the insertion is filtered out below,
        // so it is not examined again on the next crawl.
        _cachedHrefs.Add(candidate.href);
        var insertion = GetInsertion(candidate.srcNode, _sess.ID);
        if (Validate(insertion))
            fresh.Add(insertion);
    }

    return fresh;
}
/// <summary>
/// Decides whether an insertion should be reported: requests, top ads and highlighted
/// ads are dropped, as are insertions outside the session's price range.
/// </summary>
/// <param name="i"> Insertion to check. </param>
/// <returns> <c>true</c> when the insertion passes every filter. </returns>
private bool Validate(Insertion i)
{
    if (i.IsRequest || i.IsTopAd || i.IsHighlight)
        return false;
    if (i.Price < _sess.MinPrice)
        return false;
    // AsUrl treats a non-positive MaxPrice as "no upper limit" and omits it from the
    // search URL. Apply the same rule here; otherwise an unset limit would reject
    // every insertion that has a price.
    if (_sess.MaxPrice > 0 && i.Price > _sess.MaxPrice)
        return false;
    return true;
}
#region Node Methods
/// <summary>
/// Selects the insertion elements of a result page and keeps only those that carry
/// the href attribute.
/// </summary>
private static IList<HtmlNode> GetInsertionNodes(HtmlDocument doc)
{
    // A node qualifies only if it exists and has an attribute named ATTR_DATA_HREF.
    static bool CarriesHref(HtmlNode node)
    {
        if (node == null || !node.HasAttributes)
            return false;
        return node.Attributes.Any(attribute => attribute.Name == ATTR_DATA_HREF);
    }

    return doc.QuerySelectorAll(SELECTOR_INSERTION)
              .Where(CarriesHref)
              .ToList();
}
/// <summary> Pairs every insertion node with the value of its href attribute, in input order. </summary>
private static List<(string href, HtmlNode srcNode)> GetInsertionHrefs(IList<HtmlNode> insertionNodes)
{
    IEnumerable<(string href, HtmlNode srcNode)> pairs =
        from node in insertionNodes
        select (href: node.Attributes[ATTR_DATA_HREF].Value, srcNode: node);

    return pairs.ToList();
}
/// <summary> Builds an <see cref="Insertion"/> from one insertion node of the result page. </summary>
/// <param name="insertionNode"> Node that carries the href attribute and the insertion markup. </param>
/// <param name="sessionID"> ID of the crawl session the insertion belongs to. </param>
private static Insertion GetInsertion(HtmlNode insertionNode, int sessionID)
{
    (decimal amount, bool negotiable) = GetPriceVB(insertionNode, SELECTOR_PRICE);
    (int postCode, string location) = GetPostCodeLocation(insertionNode);

    string href = insertionNode.Attributes[ATTR_DATA_HREF].Value;
    string title = GetInnerText(insertionNode, SELECTOR_NAME);
    DateTime? postedOn = GetDate(insertionNode, SELECTOR_DATE);

    // The feature icons are either present or absent; their content is irrelevant.
    bool topAd = insertionNode.QuerySelector(SELECTOR_TOPAD) != null;
    bool highlighted = insertionNode.QuerySelector(SELECTOR_HIGHLIGHT) != null;

    // The bottom tag is compared case-insensitively with the request marker.
    string bottomTag = GetInnerText(insertionNode, SELECTOR_BOTTOM_TAG);
    bool request = bottomTag.ToUpperInvariant() == BOTTOM_TAG_GESUCH;

    return new Insertion(href: href,
                         crawlSessionID: sessionID,
                         name: title,
                         postCode: postCode,
                         locationStr: location,
                         price: amount,
                         is_vb: negotiable,
                         date: postedOn,
                         isTopAd: topAd,
                         isHighlight: highlighted,
                         isRequest: request);
}
/// <summary>
/// Reads the price text of an insertion and splits it into the numeric amount and
/// the "VB" marker.
/// </summary>
/// <param name="insertionNode"> Insertion node to read from. </param>
/// <param name="selector"> CSS selector of the price element, relative to the node. </param>
/// <returns>
/// The parsed amount (0 when the text holds no parsable number) and whether the text
/// contains "VB".
/// </returns>
private static (decimal price, bool is_vb) GetPriceVB(HtmlNode insertionNode, string selector)
{
    string priceVB = GetInnerText(insertionNode, selector);
    bool is_vb = priceVB.Contains("VB");
    string priceStr = ReduceToNumeric(priceVB);

    // GetCultureInfo returns a cached, read-only culture that ignores user overrides,
    // unlike "new CultureInfo(...)", which allocates on every call and picks up
    // user-customised number settings on Windows.
    CultureInfo german = CultureInfo.GetCultureInfo("de-DE");

    // TryParse fails on an empty string, so the "no digits" case also yields 0.
    return decimal.TryParse(priceStr, NumberStyles.Number, german, out decimal parsed)
        ? (parsed, is_vb)
        : (0m, is_vb);
}
/// <summary>
/// Splits the location text of an insertion into its leading five-character post code
/// and the remaining location name.
/// </summary>
/// <returns>
/// The post code and trimmed location, or <c>(-1, null)</c> when the text is empty,
/// shorter than five characters, or does not start with a parsable number.
/// </returns>
private static (int postCode, string loc) GetPostCodeLocation(HtmlNode insertionNode)
{
    const int postCodeLength = 5;

    string raw = GetInnerText(insertionNode, SELECTOR_LOCATION);
    if (string.IsNullOrEmpty(raw))
        return (-1, null);

    // Collapse every run of whitespace into a single space.
    string collapsed = System.Text.RegularExpressions.Regex.Replace(raw, @"\s+", " ");
    if (collapsed.Length < postCodeLength)
        return (-1, null);

    if (!int.TryParse(collapsed.Substring(0, postCodeLength), out int plz))
        return (-1, null);

    return (plz, collapsed.Substring(postCodeLength).Trim());
}
/// <summary>
/// Returns the inner text of the first element matching <paramref name="selector"/>
/// below <paramref name="insertionNode"/>, with line feeds removed and surrounding
/// whitespace trimmed. Returns an empty string when nothing matches.
/// </summary>
private static string GetInnerText(HtmlNode insertionNode, string selector)
{
    HtmlNode match = insertionNode.QuerySelector(selector);
    string text = match == null ? "" : (match.InnerText ?? "");
    return text.Replace("\n", "").Trim();
}
/// <summary>
/// Parses the date text of an insertion.
/// </summary>
/// <remarks>
/// Known formats:
///  - "Heute, 09:02"   (today at the given time)
///  - "Gestern, 21:21" (yesterday at the given time)
///  - "26.10.2022"     (plain date)
/// "Heute"/"Gestern" are resolved against <see cref="DateTime.Today"/>, i.e. the local
/// date of the machine running the crawler.
/// </remarks>
/// <param name="insertionNode"> Insertion node to read from. </param>
/// <param name="selector"> CSS selector of the date element, relative to the node. </param>
/// <returns> The parsed date, or <c>null</c> when the text is empty or in an unknown format. </returns>
private static DateTime? GetDate(HtmlNode insertionNode, string selector)
{
    const string separator = ", ";

    string dateText = GetInnerText(insertionNode, selector);
    if (string.IsNullOrEmpty(dateText))
        return null;

    // Ordinal search: the separator is fixed markup, not linguistic text.
    int sepaIndex = dateText.IndexOf(separator, StringComparison.Ordinal);
    bool hasSeparator = sepaIndex != -1;
    string start = hasSeparator ? dateText.Substring(0, sepaIndex) : null;
    string end = hasSeparator ? dateText.Substring(sepaIndex + separator.Length) : dateText;

    bool isToday = start == "Heute";
    bool isYesterday = start == "Gestern";
    if ((isToday || isYesterday)
        && TimeSpan.TryParseExact(end, "hh\\:mm", CultureInfo.InvariantCulture, out TimeSpan time))
    {
        DateTime day = isToday ? DateTime.Today : DateTime.Today.AddDays(-1);
        return day.AddMinutes(time.TotalMinutes);
    }

    // Invariant culture: with a null provider the current culture's calendar is used,
    // which yields a wrong year on machines with a non-Gregorian default calendar.
    if (DateTime.TryParseExact(end, "dd.MM.yyyy", CultureInfo.InvariantCulture, DateTimeStyles.None, out DateTime date))
        return date;

    return null;
}
// Characters that survive ReduceToNumeric: digits plus both separator characters.
const string ALLOWED_NUMCHARS = "0123456789,.";
/// <summary>
/// Removes every character from <paramref name="s"/> that is not a digit, a comma or
/// a period, keeping the remaining characters in their original order.
/// </summary>
/// <param name="s"> Text to filter; may be null or empty. </param>
/// <returns> The filtered text, or an empty string when the input is null or empty. </returns>
private static string ReduceToNumeric(string s)
{
    if (string.IsNullOrEmpty(s))
        return "";

    // Single forward pass into a builder; repeated string prepending is quadratic.
    var kept = new StringBuilder(s.Length);
    foreach (char c in s)
    {
        if (ALLOWED_NUMCHARS.IndexOf(c) >= 0)
            kept.Append(c);
    }
    return kept.ToString();
}
//private static List<Insertion> GetInsertions(IList<HtmlNode> insertionNodes)
// => insertionNodes.Select(x => GetInsertion(x)).ToList();
#endregion
/// <summary>
/// Builds the search URL for a crawl session by filling the placeholders of
/// <c>Config.EXAMPLE_SEARCH_URL</c>.
/// </summary>
/// <param name="cs"> Session that supplies keywords, location, category, radius, price limits and poster type. </param>
/// <returns> The formatted search URL. </returns>
private static string AsUrl(CrawlSession cs)
    // Invariant culture keeps the numeric URL parameters independent of the machine's
    // regional settings (e.g. no decimal comma). Non-positive price limits are sent as
    // empty values.
    => string.Format(CultureInfo.InvariantCulture,
                     Config.EXAMPLE_SEARCH_URL,
                     HttpUtility.UrlEncode(cs.SearchParams.KeyWords),
                     HttpUtility.UrlEncode(cs.SearchParams.LocationStr),
                     cs.SearchParams.CategoryID,
                     cs.SearchParams.Radius,
                     cs.MinPrice <= 0 ? "" : cs.MinPrice,
                     cs.MaxPrice <= 0 ? "" : cs.MaxPrice,
                     GetPosterType(cs));
/// <summary>
/// Maps the session's poster flags to the poster-type value used in the search URL:
/// "PRIVATE" or "COMMERCIAL" when exactly one flag is set, otherwise an empty string.
/// </summary>
private static string GetPosterType(CrawlSession cs)
    => (cs.IsPrivate, cs.IsCommercial) switch
    {
        (true, false) => "PRIVATE",
        (false, true) => "COMMERCIAL",
        // Both or neither selected: no poster-type restriction.
        _ => string.Empty,
    };
}
}