You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
248 lines
10 KiB
248 lines
10 KiB
using HtmlAgilityPack;
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.Globalization;
|
|
using System.Linq;
|
|
using System.Net;
|
|
using System.Text;
|
|
using System.Threading.Tasks;
|
|
using System.Web;
|
|
using PhilExampleCrawler.Common.Models;
|
|
|
|
namespace PhilExampleCrawler.Core
|
|
{
|
|
public class BaseCrawler_Best
|
|
{
|
|
#region HTML Tags & Selectors
/// <summary> Matches every insertion found on the search result page. </summary>
private const string SELECTOR_INSERTION = "#srchrslt-adtable > .ad-listitem > article";

// Markers inside a single insertion: their presence or text classifies the insertion.
/// <summary> Icon whose presence marks an insertion as a top ad. </summary>
internal const string SELECTOR_TOPAD = ".aditem-main .aditem-main--top .aditem-main--top--right i.icon-feature-topad";
/// <summary> Icon whose presence marks an insertion as highlighted. </summary>
internal const string SELECTOR_HIGHLIGHT = ".aditem-main .aditem-main--top .aditem-main--top--right i.icon-feature-highlight";
/// <summary> Tag at the bottom of an insertion; its text is compared with <see cref="BOTTOM_TAG_GESUCH"/>. </summary>
internal const string SELECTOR_BOTTOM_TAG = ".aditem-main .aditem-main--bottom p .simpletag.tag-small";

// Elements whose inner text supplies the values of an insertion.
/// <summary> Element holding the insertion's name. </summary>
internal const string SELECTOR_NAME = ".aditem-main .aditem-main--middle .text-module-begin";
/// <summary> Element holding the insertion's date text. </summary>
internal const string SELECTOR_DATE = ".aditem-main .aditem-main--top .aditem-main--top--right";
/// <summary> Element holding the post code followed by the location name. </summary>
internal const string SELECTOR_LOCATION = ".aditem-main .aditem-main--top .aditem-main--top--left";
/// <summary> Element holding the price text. </summary>
internal const string SELECTOR_PRICE = ".aditem-main .aditem-main--middle .aditem-main--middle--price-shipping .aditem-main--middle--price-shipping--price";

/// <summary> Attribute of an insertion node that carries the insertion's href. </summary>
internal const string ATTR_DATA_HREF = "data-href";

/// <summary> Upper-cased text of the bottom tag that identifies a request. </summary>
internal const string BOTTOM_TAG_GESUCH = "GESUCH";
#endregion
|
|
|
|
|
|
/// <summary> Session this crawler works for; supplies the session ID and the price limits used during validation. </summary>
private readonly CrawlSession _sess;

/// <summary> Client used to request the search result page; supplied by the caller. </summary>
private readonly HttpClient _httpClient;

/// <summary> Hrefs of all insertions seen so far; an insertion whose href is missing here counts as new. </summary>
private readonly List<string> _cachedHrefs = new();

/// <summary> True until the first result page has been processed; that page only fills the cache. </summary>
private bool _firstCrawl = true;

/// <summary> Address requested on each crawl; initially built from the session's search parameters. </summary>
private string _searchUrl;

/// <summary> Creates a crawler for the given session. </summary>
/// <param name="crawlSession"> Session whose search parameters determine the search URL. </param>
/// <param name="client"> Client used for all requests made by this crawler. </param>
/// <exception cref="ArgumentNullException"> <paramref name="crawlSession"/> or <paramref name="client"/> is null. </exception>
public BaseCrawler_Best(CrawlSession crawlSession, HttpClient client)
{
    // Fail here rather than later: a null client would otherwise only surface inside
    // CrawlAsync's catch-all handler and turn every crawl into a silent null result.
    _sess = crawlSession ?? throw new ArgumentNullException(nameof(crawlSession));
    _httpClient = client ?? throw new ArgumentNullException(nameof(client));
    _searchUrl = AsUrl(crawlSession);
}
|
|
|
|
/// <summary> Requests the search result page and evaluates the insertions listed on it. </summary>
/// <returns>
/// The insertions that were not seen on an earlier crawl and pass validation (empty for the
/// first processed page, which only fills the cache), or <c>null</c> when the page could not
/// be retrieved or processing failed.
/// </returns>
public async Task<List<Insertion>> CrawlAsync()
{
    try
    {
        // Dispose the response once the body has been read so its resources are released promptly.
        using var response = await _httpClient.GetAsync(_searchUrl);
        var html = await response.Content.ReadAsStringAsync();

        if (string.IsNullOrEmpty(html))
        {
            // The request ended at a different address (presumably a redirect): use that address from now on.
            var finalUrl = response.RequestMessage?.RequestUri?.AbsoluteUri;
            if (finalUrl != null && finalUrl != _searchUrl)
                _searchUrl = finalUrl;

            return null; //TODO: LOG ERROR
        }

        if (!response.IsSuccessStatusCode)
        {
            // An error page contains no insertion nodes. Parsing it would look like an empty result;
            // on the first crawl that would leave the cache empty, so the next successful crawl
            // would report every listed insertion as new.
            Console.WriteLine("TODO: ERROR! Unexpected status code " + (int)response.StatusCode);
            return null;
        }

        var document = new HtmlDocument();
        document.LoadHtml(html);

        return CompareToFoundNodes(GetInsertionNodes(document));
    }
    catch (Exception ex)
    {
        Console.WriteLine("TODO: ERROR! " + ex);
        return null;
    }
}
|
|
|
|
/// <summary>
/// Records the hrefs of the given insertion nodes and returns the valid insertions whose href
/// had not been recorded before. The first call only records hrefs and returns an empty list.
/// </summary>
/// <param name="insertionNodes"> Insertion nodes taken from the current search result page. </param>
/// <returns> Newly found insertions that pass <see cref="Validate"/>. </returns>
private List<Insertion> CompareToFoundNodes(IList<HtmlNode> insertionNodes)
{
    var hrefNodes = GetInsertionHrefs(insertionNodes);
    var fresh = new List<Insertion>();

    if (_firstCrawl)
    {
        // Nothing can be "new" yet: remember everything that is currently listed.
        foreach (var entry in hrefNodes)
            _cachedHrefs.Add(entry.href);

        _firstCrawl = false;
        return fresh;
    }

    foreach (var entry in hrefNodes)
    {
        if (_cachedHrefs.Contains(entry.href))
            continue;

        // Remember the href before validating, so a rejected insertion is not looked at again.
        _cachedHrefs.Add(entry.href);

        Insertion candidate = GetInsertion(entry.srcNode, _sess.ID);
        if (Validate(candidate))
            fresh.Add(candidate);
    }

    return fresh;
}
|
|
|
|
/// <summary>
/// Decides whether an insertion is reported: requests, top ads and highlighted insertions are
/// dropped, as are insertions priced outside the session's price limits.
/// </summary>
/// <param name="i"> Insertion to check. </param>
/// <returns> True when the insertion should be reported. </returns>
private bool Validate(Insertion i)
{
    if (i.IsRequest || i.IsTopAd || i.IsHighlight)
        return false;

    if (i.Price < _sess.MinPrice)
        return false;

    // AsUrl treats a MaxPrice of 0 or less as "no upper limit" and leaves it out of the query.
    // Apply the same rule here; otherwise an unset maximum rejects every insertion priced above 0.
    if (_sess.MaxPrice > 0 && i.Price > _sess.MaxPrice)
        return false;

    return true;
}
|
|
|
|
|
|
#region Node Methods
|
|
/// <summary> Selects the insertion nodes of a result page that carry a data-href attribute. </summary>
/// <param name="doc"> Parsed search result page. </param>
/// <returns> The matching nodes, in document query order. </returns>
private static IList<HtmlNode> GetInsertionNodes(HtmlDocument doc)
{
    static bool HasDataHref(HtmlNode node)
        => node != null
           && node.HasAttributes
           && node.Attributes.Any(attribute => attribute.Name == ATTR_DATA_HREF);

    IList<HtmlNode> candidates = doc.QuerySelectorAll(SELECTOR_INSERTION);

    var withHref = from candidate in candidates
                   where HasDataHref(candidate)
                   select candidate;

    return withHref.ToList();
}
|
|
|
|
/// <summary> Pairs every insertion node with the value of its data-href attribute. </summary>
/// <param name="insertionNodes"> Nodes that all carry the attribute, as returned by GetInsertionNodes. </param>
/// <returns> One (href, node) pair per input node, in input order. </returns>
private static List<(string href, HtmlNode srcNode)> GetInsertionHrefs(IList<HtmlNode> insertionNodes)
    // Read the attribute through the shared constant so this lookup cannot drift from the
    // attribute name that GetInsertionNodes filters by.
    => insertionNodes.Select(node => (node.Attributes[ATTR_DATA_HREF].Value, node)).ToList();
|
|
|
|
/// <summary> Builds an <see cref="Insertion"/> from the values found inside one insertion node. </summary>
/// <param name="insertionNode"> Node of a single insertion; must carry the data-href attribute. </param>
/// <param name="sessionID"> ID of the crawl session the insertion is assigned to. </param>
/// <returns> The populated insertion. </returns>
private static Insertion GetInsertion(HtmlNode insertionNode, int sessionID)
{
    string href = insertionNode.Attributes["data-href"].Value;
    string title = GetInnerText(insertionNode, SELECTOR_NAME);
    (decimal amount, bool negotiable) = GetPriceVB(insertionNode, SELECTOR_PRICE);
    (int plz, string place) = GetPostCodeLocation(insertionNode);
    DateTime? postedOn = GetDate(insertionNode, SELECTOR_DATE);

    // Top ads and highlights are recognised purely by the presence of their icon.
    bool topAd = insertionNode.QuerySelector(SELECTOR_TOPAD) != null;
    bool highlighted = insertionNode.QuerySelector(SELECTOR_HIGHLIGHT) != null;

    // A request is recognised by the text of its bottom tag.
    string bottomTag = GetInnerText(insertionNode, SELECTOR_BOTTOM_TAG).ToUpperInvariant();
    bool request = bottomTag == BOTTOM_TAG_GESUCH;

    return new Insertion(href: href,
                         crawlSessionID: sessionID,
                         name: title,
                         postCode: plz,
                         locationStr: place,
                         price: amount,
                         is_vb: negotiable,
                         date: postedOn,
                         isTopAd: topAd,
                         isHighlight: highlighted,
                         isRequest: request);
}
|
|
|
|
/// <summary> Reads the price text of an insertion and splits it into an amount and the "VB" marker. </summary>
/// <param name="insertionNode"> Node of a single insertion. </param>
/// <param name="selector"> Selector of the element that holds the price text. </param>
/// <returns>
/// The amount parsed with German number formatting (0 when the text holds no parsable number)
/// and whether the text contains "VB".
/// </returns>
private static (decimal price, bool is_vb) GetPriceVB(HtmlNode insertionNode, string selector)
{
    string priceText = GetInnerText(insertionNode, selector);
    bool hasVbMarker = priceText.Contains("VB");

    // GetCultureInfo returns a cached, read-only culture, so no CultureInfo is allocated per
    // insertion and user-level overrides of the number format cannot affect parsing.
    // TryParse leaves the amount at 0 when it fails, which also covers an empty numeric string.
    decimal.TryParse(ReduceToNumeric(priceText),
                     NumberStyles.Number,
                     CultureInfo.GetCultureInfo("de-DE"),
                     out decimal amount);

    return (amount, hasVbMarker);
}
|
|
|
|
/// <summary> Splits the location text of an insertion into its leading post code and the remaining name. </summary>
/// <param name="insertionNode"> Node of a single insertion. </param>
/// <returns>
/// The number parsed from the first five characters and the trimmed remainder,
/// or (-1, null) when the text is empty, shorter than five characters or does not start with a number.
/// </returns>
private static (int postCode, string loc) GetPostCodeLocation(HtmlNode insertionNode)
{
    string raw = GetInnerText(insertionNode, SELECTOR_LOCATION);
    if (string.IsNullOrEmpty(raw))
        return (-1, null);

    // Collapse every run of whitespace into a single space.
    string normalized = System.Text.RegularExpressions.Regex.Replace(raw, @"\s+", " ");
    if (normalized.Length < 5)
        return (-1, null);

    return int.TryParse(normalized.Substring(0, 5), out int plz)
        ? (plz, normalized.Substring(5).Trim())
        : (-1, null);
}
|
|
|
|
/// <summary> Returns the inner text of the first element matching the selector, without line feeds and trimmed. </summary>
/// <param name="insertionNode"> Node to search in. </param>
/// <param name="selector"> CSS selector of the wanted element. </param>
/// <returns> The cleaned text, or an empty string when nothing matches. </returns>
private static string GetInnerText(HtmlNode insertionNode, string selector)
{
    HtmlNode match = insertionNode.QuerySelector(selector);
    string text = match == null ? "" : match.InnerText ?? "";

    return text.Replace("\n", "").Trim();
}
|
|
|
|
/// <summary> Parses the date text of an insertion. </summary>
/// <remarks>
/// Known formats:
///  - Heute, 09:02
///  - Gestern, 21:21
///  - 26.10.2022
/// </remarks>
/// <param name="insertionNode"> Node of a single insertion. </param>
/// <param name="selector"> Selector of the element that holds the date text. </param>
/// <returns> The parsed date (today's or yesterday's date plus the given time for the relative formats), or null when the text matches no known format. </returns>
private static DateTime? GetDate(HtmlNode insertionNode, string selector)
{
    string dateText = GetInnerText(insertionNode, selector);
    if (string.IsNullOrEmpty(dateText))
        return null;

    // Ordinal search: the separator is a fixed token, not linguistic text.
    int sepaIndex = dateText.IndexOf(", ", StringComparison.Ordinal);

    // A found separator always leaves room for the two characters skipped below.
    string start = sepaIndex != -1 ? dateText.Substring(0, sepaIndex) : null;
    string end = sepaIndex != -1 ? dateText.Substring(sepaIndex + 2) : dateText;

    if (start == "Heute" && TimeSpan.TryParseExact(end, "hh\\:mm", CultureInfo.InvariantCulture, out TimeSpan time))
        return DateTime.Today.AddMinutes(time.TotalMinutes);

    if (start == "Gestern" && TimeSpan.TryParseExact(end, "hh\\:mm", CultureInfo.InvariantCulture, out time))
        return DateTime.Today.AddDays(-1).AddMinutes(time.TotalMinutes);

    // Invariant culture pins the Gregorian calendar; with the current culture the year could be
    // interpreted in a different calendar on some machines.
    if (DateTime.TryParseExact(end, "dd.MM.yyyy", CultureInfo.InvariantCulture, DateTimeStyles.None, out DateTime date))
        return date;

    return null;
}
|
|
|
|
|
|
/// <summary> Characters kept by <see cref="ReduceToNumeric"/>: digits plus the two number separators. </summary>
const string ALLOWED_NUMCHARS = "0123456789,.";

/// <summary> Removes every character that is not a digit, comma or dot, keeping the original order. </summary>
/// <param name="s"> Text to reduce; may be null or empty. </param>
/// <returns> The remaining characters, or an empty string when <paramref name="s"/> is null or empty. </returns>
private static string ReduceToNumeric(string s)
{
    if (string.IsNullOrEmpty(s))
        return "";

    // One forward pass into a builder instead of prepending to a string for every kept character.
    var kept = new StringBuilder(s.Length);
    foreach (char c in s)
    {
        if (ALLOWED_NUMCHARS.IndexOf(c) >= 0)
            kept.Append(c);
    }

    return kept.ToString();
}
|
|
|
|
//private static List<Insertion> GetInsertions(IList<HtmlNode> insertionNodes)
|
|
// => insertionNodes.Select(x => GetInsertion(x)).ToList();
|
|
#endregion
|
|
|
|
|
|
/// <summary> Builds the search URL for a session by filling the configured URL template. </summary>
/// <param name="cs"> Session whose search parameters, price limits and poster type are inserted. </param>
/// <returns> The search URL; a price limit of 0 or less is inserted as an empty value. </returns>
private static string AsUrl(CrawlSession cs)
    // Invariant culture keeps numeric arguments free of culture-specific separators and signs,
    // so the generated URL does not depend on the machine's regional settings.
    => string.Format(CultureInfo.InvariantCulture,
                     Config.EXAMPLE_SEARCH_URL,
                     HttpUtility.UrlEncode(cs.SearchParams.KeyWords),
                     HttpUtility.UrlEncode(cs.SearchParams.LocationStr),
                     cs.SearchParams.CategoryID,
                     cs.SearchParams.Radius,
                     cs.MinPrice <= 0 ? "" : cs.MinPrice,
                     cs.MaxPrice <= 0 ? "" : cs.MaxPrice,
                     GetPosterType(cs));
|
|
|
|
/// <summary> Maps the session's poster flags to the poster type value used in the search URL. </summary>
/// <param name="cs"> Session whose IsPrivate and IsCommercial flags are read. </param>
/// <returns> "PRIVATE" or "COMMERCIAL" when exactly that flag is set; an empty string when both or neither are set. </returns>
private static string GetPosterType(CrawlSession cs)
    => (cs.IsPrivate, cs.IsCommercial) switch
    {
        (true, false) => "PRIVATE",
        (false, true) => "COMMERCIAL",
        _ => string.Empty,
    };
|
|
}
|
|
}
|