using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Web;
using PhilExampleCrawler.Common.Models;
namespace PhilExampleCrawler.Core
{
/// <summary>
/// HtmlAgilityPack-based crawler: loads a search result page, extracts the
/// insertion (ad) nodes from it and raises <see cref="OnNewInsertionFound"/>
/// for every insertion whose href was not seen in an earlier crawl.
/// </summary>
internal class BaseCrawler_HAP
{
    #region HTML Tags & Selectors
    const string SELECTOR_INSERTION = "#srchrslt-adtable > .ad-listitem > article"; //reflects found insertions on the search result page
    internal const string SELECTOR_TOPAD = ".aditem-main .aditem-main--top .aditem-main--top--right i.icon-feature-topad";
    internal const string SELECTOR_HIGHLIGHT = ".aditem-main .aditem-main--top .aditem-main--top--right i.icon-feature-highlight";
    internal const string SELECTOR_BOTTOM_TAG = ".aditem-main .aditem-main--bottom p .simpletag.tag-small";
    internal const string SELECTOR_LOCATION = ".aditem-main .aditem-main--top .aditem-main--top--left";
    internal const string ATTR_DATA_HREF = "data-href";
    internal const string BOTTOM_TAG_GESUCH = "GESUCH";
    #endregion

    readonly HtmlWeb _web = new();

    // Hrefs of every insertion seen so far. A set gives O(1) membership checks;
    // the ordinal comparer matches the original '==' string comparison.
    HashSet<string> _cachedHrefs = new(StringComparer.Ordinal);

    // True until the first successful crawl; that crawl only seeds the cache.
    bool _firstCrawl = true;

    // NOTE(review): only ever reset in StopCrawling; nothing visible sets it to true yet.
    bool _crawling = false;

    /// <summary>
    /// Raised once per insertion href that was not present in any previous crawl.
    /// The event argument is whatever <see cref="GetInsertion"/> returns for the node.
    /// </summary>
    internal event EventHandler<Insertion> OnNewInsertionFound;

    //TODO: crawl categories
    //TODO: crawl locations

    /// <summary>
    /// Intended entry point for periodic crawling. Not implemented yet: the body is empty
    /// and both parameters are currently ignored.
    /// </summary>
    /// <param name="searchParams">Search parameters for the crawl.</param>
    /// <param name="intervalSec">Presumably the pause between two crawls in seconds - TODO confirm once implemented.</param>
    internal void StartCrawling(CrawlSearchParams searchParams, int intervalSec = 10)
    {
        //TODO: implement the crawl loop
    }

    /// <summary>
    /// Performs a single crawl: builds the search URL, loads the result page and compares
    /// the insertions found on it against the hrefs cached from earlier crawls.
    /// </summary>
    /// <param name="searchParams">Search parameters the URL is built from.</param>
    /// <param name="timeout_ms">Value passed through to the page load call.</param>
    internal void Crawl(CrawlSearchParams searchParams, int timeout_ms)
    {
        var searchUrl = AsUrl(searchParams);
        var searchResultDoc = _web.Load(searchUrl, timeout_ms);
        if (searchResultDoc == null)
        {
            return;
        }

        var insNodes = GetInsertionNodes(searchResultDoc);
        CompareToFoundNodes(insNodes);
    }

    //TODO: Implement CancellationToken
    /// <summary>
    /// Resets the crawler state so that the next crawl is treated as a first crawl again.
    /// </summary>
    internal void StopCrawling()
    {
        _crawling = false;
        _firstCrawl = true;
        // A fresh instance (rather than Clear) leaves the previous set untouched.
        _cachedHrefs = new HashSet<string>(StringComparer.Ordinal);
    }

    /// <summary>
    /// Compares the hrefs of the given nodes with the cached ones. The first crawl only
    /// fills the cache; later crawls raise <see cref="OnNewInsertionFound"/> for each
    /// href that is not cached yet and then cache it.
    /// </summary>
    /// <param name="insertionNodes">Insertion nodes that carry a data-href attribute.</param>
    private void CompareToFoundNodes(IList<HtmlNode> insertionNodes)
    {
        List<(string href, HtmlNode srcNode)> insHrefNodes = GetInsertionHrefs(insertionNodes);

        if (_firstCrawl)
        {
            // Everything already listed at start-up counts as known, not as new.
            _cachedHrefs.UnionWith(insHrefNodes.Select(x => x.href));
            _firstCrawl = false;
            return;
        }

        foreach ((string href, HtmlNode srcNode) in insHrefNodes)
        {
            // Add returns false for an already cached href, so each href fires at most once.
            if (_cachedHrefs.Add(href))
            {
                OnNewInsertionFound?.Invoke(this, GetInsertion(srcNode));
            }
        }
    }

    #region Node Methods
    /// <summary>
    /// Selects the insertion nodes of a search result page, keeping only non-null nodes
    /// that have a data-href attribute.
    /// </summary>
    /// <param name="doc">Loaded search result document.</param>
    /// <returns>The filtered insertion nodes.</returns>
    private static IList<HtmlNode> GetInsertionNodes(HtmlDocument doc)
    {
        IList<HtmlNode> insNodes = doc.QuerySelectorAll(SELECTOR_INSERTION);
        return insNodes.Where(n => n != null
                                && n.HasAttributes
                                && n.Attributes.Any(a => a.Name == ATTR_DATA_HREF)).ToList();
    }

    /// <summary>
    /// Pairs every node with the value of its data-href attribute.
    /// </summary>
    /// <param name="insertionNodes">Nodes that are expected to carry a data-href attribute.</param>
    /// <returns>One (href, node) tuple per input node, in input order.</returns>
    private static List<(string href, HtmlNode srcNode)> GetInsertionHrefs(IList<HtmlNode> insertionNodes)
        => insertionNodes.Select(x => (x.Attributes[ATTR_DATA_HREF].Value, x)).ToList();

    /// <summary>
    /// Converts an insertion node into an <see cref="Insertion"/>.
    /// Currently a stub that always returns null; the commented-out draft below is kept
    /// as the starting point for the real implementation.
    /// </summary>
    /// <param name="insertionNode">Node of a single insertion.</param>
    /// <returns>Always null for now, so event subscribers receive a null payload.</returns>
    private static Insertion GetInsertion(HtmlNode insertionNode)
    {
        //var i = new Insertion
        //{
        //    Href = insertionNode.Attributes["data-href"].Value,
        //    IsTopAd = insertionNode.QuerySelector(SELECTOR_TOPAD) != null,
        //    IsHighlight = insertionNode.QuerySelector(SELECTOR_HIGHLIGHT) != null,
        //};
        //var plzLoc = GetInsertionLocation(insertionNode);
        //Console.WriteLine("Plz: " + plzLoc.plz + " --- " + "Loc: " + plzLoc.loc);
        //var reqNode = insertionNode.QuerySelector(SELECTOR_BOTTOM_TAG);
        //if (reqNode != null)
        //    i.IsRequest = reqNode.InnerHtml?.ToUpperInvariant() == BOTTOM_TAG_GESUCH;
        //return i;
        return null;
    }

    /// <summary>
    /// Extracts the leading five-character number and the text following it from the
    /// location element of an insertion.
    /// </summary>
    /// <param name="insertionNode">Node of a single insertion.</param>
    /// <returns>
    /// (plz, loc) when the location text starts with five characters that parse as an
    /// integer; (-1, null) when the location element is missing, empty or does not match.
    /// </returns>
    private static (int plz, string loc) GetInsertionLocation(HtmlNode insertionNode)
    {
        var locNode = insertionNode.QuerySelector(SELECTOR_LOCATION);

        // Read InnerHtml null-safely: the selector may match nothing.
        var inner = locNode?.InnerHtml;
        if (string.IsNullOrEmpty(inner))
        {
            return (-1, null);
        }

        // Drop any markup in front of the text by cutting after the last '>'.
        int lastTagEnd = inner.LastIndexOf('>');
        if (lastTagEnd > -1)
        {
            inner = inner.Substring(lastTagEnd + 1).Trim();
        }

        if (inner.Length >= 5 && int.TryParse(inner.Substring(0, 5), out int plz))
        {
            return (plz, inner.Substring(5).Trim());
        }

        return (-1, null);
    }

    // Currently not referenced anywhere in this class.
    const string nums = "0123456789";

    /// <summary>
    /// Converts every node with <see cref="GetInsertion"/>.
    /// </summary>
    /// <param name="insertionNodes">Insertion nodes to convert.</param>
    /// <returns>One entry per input node, in input order.</returns>
    private static List<Insertion> GetInsertions(IList<HtmlNode> insertionNodes)
        => insertionNodes.Select(x => GetInsertion(x)).ToList();
    #endregion

    /// <summary>
    /// Builds the search URL by filling the configured URL template with the URL-encoded
    /// keywords and location string plus the category id and radius.
    /// </summary>
    /// <param name="searchParams">Search parameters to put into the template.</param>
    /// <returns>The formatted search URL.</returns>
    private string AsUrl(CrawlSearchParams searchParams)
    {
        return string.Format(Config.EXAMPLE_SEARCH_URL, HttpUtility.UrlEncode(searchParams.KeyWords),
                                                        HttpUtility.UrlEncode(searchParams.LocationStr),
                                                        searchParams.CategoryID,
                                                        searchParams.Radius);
    }
}
}