using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Text;
using PhilExampleCrawler.Common.Models;
namespace PhilExampleCrawler.Core
{
/// <summary>
/// Downloads the category overview page and extracts the categories it lists,
/// together with the sub-categories nested beneath each of them.
/// </summary>
public static class ExamplePageValueCrawler
{
    private const string CATEGORIES_URL = "https://page-to-be-crawled.example/categories.html";

    // Selects the list items that represent top-level categories.
    private const string SELECTOR_CATS = "body > li";

    // Selects, relative to a category list item, the links of its sub-categories.
    private const string SELECTOR_SUBCATS = "ul > li > a";

    // Name of the attribute that carries the numeric (sub-)category ID.
    // NOTE(review): the identifier says "HREF" but the attribute actually read is "data-val".
    private const string ATTR_DATA_HREF = "data-val";

    // Wrapper applied to the downloaded markup so that SELECTOR_CATS ("body > li") can match.
    // NOTE(review): the wrapper literals were empty strings before this change (which made the
    // wrapping branch unreachable); "<body>" is inferred from SELECTOR_CATS - confirm.
    private const string BODY_OPEN_TAG = "<body>";
    private const string BODY_CLOSE_TAG = "</body>";

    /// <summary>
    /// Fetches <c>CATEGORIES_URL</c> with the given client and parses the response into categories.
    /// </summary>
    /// <param name="client">HTTP client used to issue the GET request.</param>
    /// <returns>
    /// The parsed categories (each with its sub-categories added), or <c>null</c> when the
    /// request does not succeed, the response body is empty, or any exception occurs.
    /// </returns>
    public static async Task<IEnumerable<Category>> CrawlExamplePageCategories(HttpClient client)
    {
        try
        {
            // Dispose the response so its underlying resources are released promptly.
            using var response = await client.GetAsync(CATEGORIES_URL);
            if (!response.IsSuccessStatusCode)
            {
                // An error page must not be parsed as if it were the category list.
                System.Diagnostics.Trace.TraceError(
                    $"{nameof(CrawlExamplePageCategories)}: request failed with HTTP status {(int)response.StatusCode}.");
                return null;
            }

            var htmlString = await response.Content.ReadAsStringAsync();
            if (string.IsNullOrEmpty(htmlString))
            {
                System.Diagnostics.Trace.TraceError(
                    $"{nameof(CrawlExamplePageCategories)}: response body was empty.");
                return null;
            }

            // Make sure the markup has a <body> root for the category selector to match against.
            if (!htmlString.StartsWith(BODY_OPEN_TAG, StringComparison.OrdinalIgnoreCase))
            {
                htmlString = BODY_OPEN_TAG + htmlString + BODY_CLOSE_TAG;
            }

            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(htmlString);

            IEnumerable<HtmlNode> catNodes = ReadCategoryNodes(htmlDoc);
            Dictionary<HtmlNode, Category> cats = ReadCategoryValues(catNodes);
            ReadAndAddSubcategories(cats);

            // Materialize so callers do not hold a lazy query over the internal dictionary.
            return cats.Values.ToList();
        }
        catch (Exception ex)
        {
            // Best-effort crawl: report the failure instead of silently dropping it,
            // and keep signalling failure to the caller with null.
            // TODO: route through the project's logger once one is available here.
            System.Diagnostics.Trace.TraceError(
                $"{nameof(CrawlExamplePageCategories)}: {ex}");
            return null;
        }
    }

    /// <summary>
    /// Returns the non-null nodes of <paramref name="htmlDoc"/> matching <c>SELECTOR_CATS</c>.
    /// </summary>
    private static IEnumerable<HtmlNode> ReadCategoryNodes(HtmlDocument htmlDoc)
    {
        IList<HtmlNode> categories = htmlDoc.QuerySelectorAll(SELECTOR_CATS);
        return categories.Where(n => n != null);
    }

    /// <summary>
    /// Builds a <see cref="Category"/> for every node whose first matching <c>a</c> element
    /// carries an integer ID attribute. Nodes without such a link are skipped.
    /// </summary>
    /// <returns>A map from the category's node to the category created for it.</returns>
    private static Dictionary<HtmlNode, Category> ReadCategoryValues(IEnumerable<HtmlNode> catNodes)
    {
        Dictionary<HtmlNode, Category> cats = new();
        foreach (HtmlNode catNode in catNodes)
        {
            HtmlNode catATag = catNode.QuerySelector("a");
            if (TryReadDataValue(catATag, out int catID))
            {
                cats.Add(catNode, new Category(catATag.InnerText, catID));
            }
        }

        return cats;
    }

    /// <summary>
    /// For each category, finds the links matching <c>SELECTOR_SUBCATS</c> under its node and
    /// adds a <see cref="SubCategory"/> for every link that carries an integer ID attribute.
    /// </summary>
    private static void ReadAndAddSubcategories(Dictionary<HtmlNode, Category> cats)
    {
        foreach (var cat in cats)
        {
            var subCatATags = cat.Key.QuerySelectorAll(SELECTOR_SUBCATS);
            foreach (var aTag in subCatATags)
            {
                if (TryGetSubcatID(aTag, out int subCatID))
                {
                    cat.Value.SubCategories.Add(new SubCategory(aTag.InnerText, subCatID));
                }
            }
        }
    }

    /// <summary>
    /// Reads the sub-category ID from <paramref name="aTag"/>.
    /// </summary>
    /// <returns><c>true</c> when the ID attribute exists and parses as an integer.</returns>
    private static bool TryGetSubcatID(HtmlNode aTag, out int subCatID)
    {
        return TryReadDataValue(aTag, out subCatID);
    }

    /// <summary>
    /// Shared ID reader for category and sub-category links: parses the value of the
    /// <c>ATTR_DATA_HREF</c> attribute of <paramref name="aTag"/> as an integer.
    /// </summary>
    /// <param name="aTag">The element to read from; may be <c>null</c>.</param>
    /// <param name="id">
    /// The parsed ID on success. <c>-1</c> when the element or attribute is missing;
    /// <c>0</c> when the attribute is present but not an integer.
    /// </param>
    /// <returns><c>true</c> when the attribute exists and its value is an integer.</returns>
    private static bool TryReadDataValue(HtmlNode aTag, out int id)
    {
        id = -1;
        if (aTag == null || !aTag.HasAttributes)
        {
            return false;
        }

        HtmlAttribute attribute = aTag.Attributes[ATTR_DATA_HREF];

        // IDs are machine-readable markup, so parse independently of the current culture.
        return attribute != null &&
               int.TryParse(
                   attribute.Value,
                   System.Globalization.NumberStyles.Integer,
                   System.Globalization.CultureInfo.InvariantCulture,
                   out id);
    }
}
}