21 Sep 2016
I was recently building a site where users could store links to products and pages they like. Presenting those links was boring without accompanying information such as the title, description, keywords and an image.
I could have let the user enter these details manually, but that would make it hard work for them when all they really want to do is add a link to save it and move on. I decided to find a way to get this information from the url automatically, so I came up with this code.
namespace CodeShare.Library.Models.MetaData
{
/// <summary>
/// Holds the descriptive metadata collected for a single URL.
/// </summary>
public class MetaInformation
{
    /// <summary>True when at least one piece of metadata is available for <see cref="Url"/>.</summary>
    public bool HasData { get; set; }

    /// <summary>The URL the metadata belongs to.</summary>
    public string Url { get; set; }

    /// <summary>The page title.</summary>
    public string Title { get; set; }

    /// <summary>The page description.</summary>
    public string Description { get; set; }

    /// <summary>The page keywords, exactly as supplied by the page.</summary>
    public string Keywords { get; set; }

    /// <summary>The URL of an image representing the page.</summary>
    public string ImageUrl { get; set; }

    /// <summary>The name of the site the page belongs to.</summary>
    public string SiteName { get; set; }

    /// <summary>
    /// Creates an empty record for <paramref name="url"/>; <see cref="HasData"/> starts as false.
    /// </summary>
    /// <param name="url">The URL the metadata will describe.</param>
    public MetaInformation(string url)
    {
        Url = url;
        HasData = false;
    }

    /// <summary>
    /// Creates a fully populated record.
    /// </summary>
    /// <param name="url">The URL the metadata describes.</param>
    /// <param name="title">The page title.</param>
    /// <param name="description">The page description.</param>
    /// <param name="keywords">The page keywords.</param>
    /// <param name="imageUrl">The URL of a representative image.</param>
    /// <param name="siteName">The name of the site.</param>
    public MetaInformation(string url, string title, string description, string keywords, string imageUrl, string siteName)
        : this(url)
    {
        Title = title;
        Description = description;
        Keywords = keywords;
        ImageUrl = imageUrl;
        SiteName = siteName;

        // Previously HasData was left at its default (false) here even when every
        // field was supplied; derive it from the values actually passed in.
        HasData = !string.IsNullOrEmpty(title)
            || !string.IsNullOrEmpty(description)
            || !string.IsNullOrEmpty(keywords)
            || !string.IsNullOrEmpty(imageUrl)
            || !string.IsNullOrEmpty(siteName);
    }
}
}
using HtmlAgilityPack;
namespace CodeShare.Library.MetaData
{
/// <summary>
/// Reads descriptive metadata (title, description, keywords, image and site name)
/// from the <c>meta</c> tags of a web page.
/// </summary>
public static class MetaScraper
{
    /// <summary>
    /// Uses HtmlAgilityPack to load <paramref name="url"/> and collect metadata from its <c>meta</c> tags.
    /// </summary>
    /// <remarks>
    /// Plain <c>title</c>, <c>description</c> and <c>keywords</c> tags always overwrite the current value.
    /// Twitter card (<c>twitter:*</c>) and Open Graph (<c>og:*</c>) tags only fill a field that is still empty.
    /// </remarks>
    /// <param name="url">The address of the page to inspect.</param>
    /// <returns>
    /// A <c>MetaInformation</c> for <paramref name="url"/>; its <c>HasData</c> flag is true
    /// when at least one recognised meta tag was found.
    /// </returns>
    public static Models.MetaData.MetaInformation GetMetaDataFromUrl(string url)
    {
        // Get the URL specified
        var webGet = new HtmlWeb();
        var document = webGet.Load(url);
        var metaTags = document.DocumentNode.SelectNodes("//meta");
        var metaInfo = new Models.MetaData.MetaInformation(url);

        if (metaTags == null)
        {
            // No meta tags at all: return the empty record (HasData stays false).
            return metaInfo;
        }

        bool anyMatch = false;
        foreach (var tag in metaTags)
        {
            var tagContent = tag.Attributes["content"];
            if (tagContent == null)
            {
                continue;
            }

            var tagName = tag.Attributes["name"];
            var tagProperty = tag.Attributes["property"];

            bool handled = tagName != null && ApplyNamedTag(metaInfo, tagName.Value, tagContent.Value);

            // Fall back to the property attribute when the name attribute is missing
            // or not one we recognise; a tag may legitimately carry both attributes.
            if (!handled && tagProperty != null)
            {
                handled = ApplyPropertyTag(metaInfo, tagProperty.Value, tagContent.Value);
            }

            anyMatch |= handled;
        }

        metaInfo.HasData = anyMatch;
        return metaInfo;
    }

    /// <summary>Applies a tag identified by its <c>name</c> attribute; returns true when the name was recognised.</summary>
    private static bool ApplyNamedTag(Models.MetaData.MetaInformation metaInfo, string name, string content)
    {
        if (string.IsNullOrEmpty(name))
        {
            return false;
        }

        // Invariant lower-casing: a culture-sensitive ToLower() breaks the match in
        // cultures with special casing rules (e.g. "TITLE" -> "tıtle" under tr-TR).
        switch (name.ToLowerInvariant())
        {
            case "title":
                metaInfo.Title = content;
                return true;
            case "description":
                metaInfo.Description = content;
                return true;
            case "keywords":
                metaInfo.Keywords = content;
                return true;
            case "twitter:title":
                metaInfo.Title = KeepOrFill(metaInfo.Title, content);
                return true;
            case "twitter:description":
                metaInfo.Description = KeepOrFill(metaInfo.Description, content);
                return true;
            case "twitter:image":
                metaInfo.ImageUrl = KeepOrFill(metaInfo.ImageUrl, content);
                return true;
            default:
                return false;
        }
    }

    /// <summary>Applies a tag identified by its <c>property</c> attribute; returns true when the property was recognised.</summary>
    private static bool ApplyPropertyTag(Models.MetaData.MetaInformation metaInfo, string property, string content)
    {
        if (string.IsNullOrEmpty(property))
        {
            return false;
        }

        switch (property.ToLowerInvariant())
        {
            case "og:title":
                metaInfo.Title = KeepOrFill(metaInfo.Title, content);
                return true;
            case "og:description":
                metaInfo.Description = KeepOrFill(metaInfo.Description, content);
                return true;
            case "og:image":
                metaInfo.ImageUrl = KeepOrFill(metaInfo.ImageUrl, content);
                return true;
            case "og:site_name":
                // The model exposes SiteName but nothing populated it before.
                metaInfo.SiteName = KeepOrFill(metaInfo.SiteName, content);
                return true;
            default:
                return false;
        }
    }

    /// <summary>Returns <paramref name="current"/> when it already has a value, otherwise <paramref name="candidate"/>.</summary>
    private static string KeepOrFill(string current, string candidate)
    {
        return string.IsNullOrEmpty(current) ? candidate : current;
    }
}
}
You can change the case statements to look for the properties you are interested in. This is just my example to get you started.
Hopefully you will find it useful. Feel free to use it however you like and to share it with others.