|
|
|
|
using HtmlAgilityPack;
|
|
|
|
|
using System;
|
|
|
|
|
using System.Collections.Generic;
|
|
|
|
|
using System.Text;
|
|
|
|
|
using PhilExampleCrawler.Common.Models;
|
|
|
|
|
|
|
|
|
|
namespace PhilExampleCrawler.Core
|
|
|
|
|
{
|
|
|
|
|
/// <summary>
/// Downloads the category listing of the example page and converts it into
/// <see cref="Category"/> objects, each populated with its subcategories.
/// </summary>
public static class ExamplePageValueCrawler
{
    // Address of the HTML document that lists all categories.
    const string CATEGORIES_URL = "https://page-to-be-crawled.example/categories.html";

    // CSS selector for the category items: <li> elements that are direct children of <body>.
    const string SELECTOR_CATS = "body > li";

    // CSS selector for the subcategory links; it is evaluated against a single category node.
    const string SELECTOR_SUBCATS = "ul > li > a";

    // Name of the anchor attribute whose value is parsed as the numeric (sub)category ID.
    // NOTE(review): despite the constant's name, the attribute is "data-val", not an href.
    const string ATTR_DATA_HREF = "data-val";

    /// <summary>
    /// Requests <c>CATEGORIES_URL</c>, parses the returned HTML and builds the category list.
    /// </summary>
    /// <param name="client">HTTP client used to issue the GET request.</param>
    /// <returns>
    /// The categories found in the document (possibly none), or <c>null</c> when the request
    /// does not succeed, the body is empty, or any exception occurs while downloading or parsing.
    /// </returns>
    public static async Task<IEnumerable<Category>> CrawlExamplePageCategories(HttpClient client)
    {
        try
        {
            // Dispose the response so the underlying connection/content is released promptly.
            using var response = await client.GetAsync(CATEGORIES_URL);

            // Do not parse an error page as if it were the category listing.
            if (!response.IsSuccessStatusCode)
            {
                System.Diagnostics.Trace.TraceError(
                    $"Category crawl failed: GET {CATEGORIES_URL} returned HTTP {(int)response.StatusCode}.");
                return null;
            }

            var htmlString = await response.Content.ReadAsStringAsync();

            if (string.IsNullOrEmpty(htmlString))
            {
                System.Diagnostics.Trace.TraceError(
                    $"Category crawl failed: GET {CATEGORIES_URL} returned an empty body.");
                return null;
            }

            // SELECTOR_CATS is anchored at <body>, so a bare fragment is wrapped in one.
            // Ordinal comparison: this is markup, not linguistic text.
            if (!htmlString.StartsWith("<body>", StringComparison.Ordinal))
            {
                htmlString = "<body>" + htmlString + "</body>";
            }

            var htmlDoc = new HtmlDocument();
            htmlDoc.LoadHtml(htmlString);

            IEnumerable<HtmlNode> catNodes = ReadCategoryNodes(htmlDoc);
            Dictionary<HtmlNode, Category> cats = ReadCategoryValues(catNodes);
            ReadAndAddSubcategories(cats);

            // Materialize the result: a deferred query over the dictionary would keep every
            // HtmlNode key (and with it the parsed document) alive for as long as the caller
            // holds on to the returned sequence.
            return cats.Values.ToList();
        }
        catch (Exception ex)
        {
            // Best-effort crawl: failures are reported and signalled to the caller as null.
            // TODO: route this through the project's logger once one is available.
            System.Diagnostics.Trace.TraceError($"Category crawl failed: {ex}");
            return null;
        }
    }

    /// <summary>
    /// Selects the nodes matching <c>SELECTOR_CATS</c> in the document, skipping null entries.
    /// </summary>
    /// <param name="htmlDoc">The parsed HTML document to query.</param>
    /// <returns>A lazily filtered sequence of the matching nodes.</returns>
    private static IEnumerable<HtmlNode> ReadCategoryNodes(HtmlDocument htmlDoc)
    {
        IList<HtmlNode> categories = htmlDoc.QuerySelectorAll(SELECTOR_CATS);
        return categories.Where(n => n != null);
    }

    /// <summary>
    /// Creates a <see cref="Category"/> for every category node whose first matching
    /// <c>a</c> element carries a parsable integer ID attribute.
    /// </summary>
    /// <param name="catNodes">The category nodes to inspect.</param>
    /// <returns>
    /// A map from category node to the category built from it. Nodes without an anchor,
    /// without the ID attribute, or with a non-integer ID are left out.
    /// </returns>
    private static Dictionary<HtmlNode, Category> ReadCategoryValues(IEnumerable<HtmlNode> catNodes)
    {
        Dictionary<HtmlNode, Category> cats = new();

        foreach (HtmlNode catNode in catNodes)
        {
            HtmlNode catATag = catNode.QuerySelector("a");

            // TryReadId returns false for a null anchor, so catATag is safe to use below.
            if (TryReadId(catATag, out int catID))
            {
                cats.Add(catNode, new Category(catATag.InnerText, catID));
            }
        }

        return cats;
    }

    /// <summary>
    /// For every category, finds the anchors matching <c>SELECTOR_SUBCATS</c> under its node
    /// and appends a <see cref="SubCategory"/> for each anchor with a parsable integer ID.
    /// </summary>
    /// <param name="cats">Map from category node to category; the categories are mutated in place.</param>
    private static void ReadAndAddSubcategories(Dictionary<HtmlNode, Category> cats)
    {
        foreach (var cat in cats)
        {
            var subCatATags = cat.Key.QuerySelectorAll(SELECTOR_SUBCATS);

            foreach (var aTag in subCatATags)
            {
                if (TryGetSubcatID(aTag, out int subCatID))
                {
                    cat.Value.SubCategories.Add(new SubCategory(aTag.InnerText, subCatID));
                }
            }
        }
    }

    /// <summary>
    /// Reads the subcategory ID from an anchor's ID attribute.
    /// </summary>
    /// <param name="aTag">The anchor node; may be null.</param>
    /// <param name="subCatID">The parsed ID on success; <c>-1</c> on any failure.</param>
    /// <returns><c>true</c> when the attribute exists and holds an integer; otherwise <c>false</c>.</returns>
    private static bool TryGetSubcatID(HtmlNode aTag, out int subCatID)
    {
        return TryReadId(aTag, out subCatID);
    }

    /// <summary>
    /// Shared ID reader for category and subcategory anchors: parses the value of the
    /// <c>ATTR_DATA_HREF</c> attribute as an integer.
    /// </summary>
    /// <param name="aTag">The anchor node; may be null.</param>
    /// <param name="id">The parsed ID on success; <c>-1</c> on any failure.</param>
    /// <returns><c>true</c> when the attribute exists and holds an integer; otherwise <c>false</c>.</returns>
    private static bool TryReadId(HtmlNode aTag, out int id)
    {
        if (aTag != null && aTag.HasAttributes)
        {
            HtmlAttribute idAttribute = aTag.Attributes[ATTR_DATA_HREF];

            // The value is machine-generated markup, so parse it culture-independently.
            if (idAttribute != null &&
                int.TryParse(idAttribute.Value,
                             System.Globalization.NumberStyles.Integer,
                             System.Globalization.CultureInfo.InvariantCulture,
                             out id))
            {
                return true;
            }
        }

        // int.TryParse resets its out argument to 0 on failure; report one sentinel instead.
        id = -1;
        return false;
    }
}
|
|
|
|
|
}
|