using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using PhilExampleCrawler.Common.Models;

namespace PhilExampleCrawler.Core
{
    /// <summary>
    /// Downloads the category page and converts its markup into
    /// <see cref="Category"/> objects with their <see cref="SubCategory"/> children.
    /// </summary>
    public static class ExamplePageValueCrawler
    {
        const string CATEGORIES_URL = "https://page-to-be-crawled.example/categories.html";
        const string SELECTOR_CATS = "body > li";
        const string SELECTOR_SUBCATS = "ul > li > a";
        const string ATTR_DATA_HREF = "data-val";

        /// <summary>
        /// Requests <c>CATEGORIES_URL</c> and parses the categories found in the response.
        /// </summary>
        /// <param name="client">HTTP client used to issue the GET request.</param>
        /// <returns>
        /// The parsed categories, or <c>null</c> when the request does not succeed,
        /// the body is empty, or any exception occurs while crawling. Failures are
        /// reported through <see cref="System.Diagnostics.Trace"/>; nothing is thrown.
        /// </returns>
        public static async Task<IEnumerable<Category>> CrawlExamplePageCategories(HttpClient client)
        {
            try
            {
                // Dispose the response so the underlying connection/content is released.
                using var response = await client.GetAsync(CATEGORIES_URL);

                if (!response.IsSuccessStatusCode)
                {
                    // An error page must not be parsed as if it were the category list.
                    System.Diagnostics.Trace.TraceError(
                        $"{nameof(ExamplePageValueCrawler)}: GET {CATEGORIES_URL} returned status {(int)response.StatusCode}.");
                    return null;
                }

                var htmlString = await response.Content.ReadAsStringAsync();
                if (string.IsNullOrEmpty(htmlString))
                {
                    System.Diagnostics.Trace.TraceError(
                        $"{nameof(ExamplePageValueCrawler)}: GET {CATEGORIES_URL} returned an empty body.");
                    return null;
                }

                // NOTE(review): both literals below are empty in the source as received, which
                // makes this branch unreachable (every string starts with ""). They look like
                // wrapper tags that were lost when the file was extracted - restore them from
                // the original source; they are deliberately left untouched here.
                if (!htmlString.StartsWith("", StringComparison.Ordinal))
                    htmlString = "" + htmlString + "";

                var htmlDoc = new HtmlDocument();
                htmlDoc.LoadHtml(htmlString);

                IEnumerable<HtmlNode> catNodes = ReadCategoryNodes(htmlDoc);
                Dictionary<HtmlNode, Category> cats = ReadCategoryValues(catNodes);
                ReadAndAddSubcategories(cats);

                // Materialize so the caller gets a stable snapshot instead of a lazy
                // query over a dictionary that is local to this method.
                return cats.Values.ToList();
            }
            catch (Exception ex)
            {
                // Best-effort crawl: callers are signalled with null, but the cause is
                // no longer dropped silently.
                System.Diagnostics.Trace.TraceError(
                    $"{nameof(ExamplePageValueCrawler)}: crawling {CATEGORIES_URL} failed: {ex}");
            }

            return null;
        }

        /// <summary>
        /// Selects the nodes matching <c>SELECTOR_CATS</c> and drops null entries.
        /// </summary>
        private static IEnumerable<HtmlNode> ReadCategoryNodes(HtmlDocument htmlDoc)
        {
            IList<HtmlNode> categories = htmlDoc.QuerySelectorAll(SELECTOR_CATS);
            return categories.Where(n => n != null);
        }

        /// <summary>
        /// Builds a <see cref="Category"/> for every category node whose first
        /// <c>a</c> element carries an integer <c>ATTR_DATA_HREF</c> attribute.
        /// Nodes without such an element are skipped.
        /// </summary>
        /// <returns>Map from the category node to the category created for it.</returns>
        private static Dictionary<HtmlNode, Category> ReadCategoryValues(IEnumerable<HtmlNode> catNodes)
        {
            Dictionary<HtmlNode, Category> cats = new();

            foreach (HtmlNode catNode in catNodes)
            {
                HtmlNode catATag = catNode.QuerySelector("a");
                if (TryReadDataVal(catATag, out int catID))
                {
                    cats.Add(catNode, new Category(catATag.InnerText, catID));
                }
            }

            return cats;
        }

        /// <summary>
        /// For each category, queries its node with <c>SELECTOR_SUBCATS</c> and appends a
        /// <see cref="SubCategory"/> for every link that carries an integer id.
        /// </summary>
        private static void ReadAndAddSubcategories(Dictionary<HtmlNode, Category> cats)
        {
            foreach (var cat in cats)
            {
                var subCatATags = cat.Key.QuerySelectorAll(SELECTOR_SUBCATS);
                foreach (var aTag in subCatATags)
                {
                    if (TryReadDataVal(aTag, out int subCatID))
                    {
                        cat.Value.SubCategories.Add(new SubCategory(aTag.InnerText, subCatID));
                    }
                }
            }
        }

        /// <summary>
        /// Reads the <c>ATTR_DATA_HREF</c> attribute of <paramref name="aTag"/> as an integer.
        /// Shared by category and sub-category parsing so both apply the same rule.
        /// </summary>
        /// <param name="aTag">Element to inspect; may be null.</param>
        /// <param name="id">The parsed id on success; only meaningful when true is returned.</param>
        /// <returns>True when the attribute exists and its value parses as an integer.</returns>
        private static bool TryReadDataVal(HtmlNode aTag, out int id)
        {
            id = -1;

            // Single lookup through the shared constant; the attribute indexer yields
            // null when the attribute is absent.
            string rawValue = aTag?.Attributes[ATTR_DATA_HREF]?.Value;
            return rawValue != null && int.TryParse(rawValue, out id);
        }
    }
}