Why is this useful?

I was building a site recently which users could store links to products and pages they like. Presenting those links was boring without the other information like the title, description, keywords and an image.

I could have let the user enter these details manually, but that would make it hard work for them when all they really want to do is add a link to save it and move on. I decided to find a way to get this information from the url automatically, so I came up with this code.

Here's an example of it being used

A class to hold the information

namespace CodeShare.Library.Models.MetaData
{

public class MetaInformation
{
public bool HasData { get; set; }
public string Url { get; set; }
public string Title { get; set; }
public string Description { get; set; }
public string Keywords { get; set; }
public string ImageUrl { get; set; }
public string SiteName { get; set; }

public MetaInformation(string url)
{
Url = url;
HasData = false;
}

public MetaInformation(string url, string title, string description, string keywords, string imageUrl, string siteName)
{
Url = url;
Title = title;
Description = description;
Keywords = keywords;
ImageUrl = imageUrl;
SiteName = siteName;
}
}
}

The code which does the work

using HtmlAgilityPack;
namespace CodeShare.Library.MetaData
{
    public static class MetaScraper
    {
        /// <summary>
        /// Uses HtmlAgilityPack to get the meta information from a url
        /// </summary>
        /// <param name="url"></param>
        /// <returns></returns>
        public static Models.MetaData.MetaInformation GetMetaDataFromUrl(string url)
        {
            // Get the URL specified
            var webGet = new HtmlWeb();
            var document = webGet.Load(url);
            var metaTags = document.DocumentNode.SelectNodes("//meta");
            Models.MetaData.MetaInformation metaInfo = new Models.MetaData.MetaInformation(url);
            if (metaTags != null)
            {
                int matchCount = 0;
                foreach (var tag in metaTags)
                {
                    var tagName = tag.Attributes["name"];
                    var tagContent = tag.Attributes["content"];
                    var tagProperty = tag.Attributes["property"];
                    if (tagName != null && tagContent != null)
                    {
                        switch (tagName.Value.ToLower())
                        {
                            case "title":
                                metaInfo.Title = tagContent.Value;
                                matchCount++;
                                break;
                            case "description":
                                metaInfo.Description = tagContent.Value;
                                matchCount++;
                                break;
                            case "twitter:title":
                                metaInfo.Title = string.IsNullOrEmpty(metaInfo.Title) ? tagContent.Value : metaInfo.Title;
                                matchCount++;
                                break;
                            case "twitter:description":
                                metaInfo.Description = string.IsNullOrEmpty(metaInfo.Description) ? tagContent.Value : metaInfo.Description;
                                matchCount++;
                                break;
                            case "keywords":
                                metaInfo.Keywords = tagContent.Value;
                                matchCount++;
                                break;
                            case "twitter:image":
                                metaInfo.ImageUrl = string.IsNullOrEmpty(metaInfo.ImageUrl) ? tagContent.Value : metaInfo.ImageUrl;
                                matchCount++;
                                break;
                        }
                    }
                    else if (tagProperty != null && tagContent != null)
                    {
                        switch (tagProperty.Value.ToLower())
                        {
                            case "og:title":
                                metaInfo.Title = string.IsNullOrEmpty(metaInfo.Title) ? tagContent.Value : metaInfo.Title;
                                matchCount++;
                                break;
                            case "og:description":
                                metaInfo.Description = string.IsNullOrEmpty(metaInfo.Description) ? tagContent.Value : metaInfo.Description;
                                matchCount++;
                                break;
                            case "og:image":
                                metaInfo.ImageUrl = string.IsNullOrEmpty(metaInfo.ImageUrl) ? tagContent.Value : metaInfo.ImageUrl;
                                matchCount++;
                                break;
                        }
                    }
                }
                metaInfo.HasData = matchCount > 0;
            }
            return metaInfo;
        }
    }
}

You can change the case statements to look for the properties you are interested in. This is just my example to get you started.

Hopefully you will find it useful. Feel free to use as you want and share with others.

Paul Seal

Umbraco MVP and .NET Web Developer from Derby (UK) who specialises in building Content Management System (CMS) websites using MVC with Umbraco as a framework. Paul is passionate about web development and programming as a whole. Apart from when he's with his wife and son, if he's not writing code, he's thinking about it or listening to a podcast about it.

Proudly sponsored by

Moriyama

  • Moriyama build, support and deploy Umbraco, Azure and ASP.NET websites and applications.
AppVeyor

  • CI/CD service for Windows, Linux and macOS
  • Build, test, deploy your apps faster, on any platform.
elmah.io

  • elmah.io is the easy error logging and uptime monitoring service for .NET.
  • Take back control of your errors with support for all .NET web and logging frameworks.
uSync Complete

  • uSync.Complete gives you all the uSync packages, allowing you to completely control how your Umbraco settings, content and media is stored, transferred and managed across all your Umbraco Installations.
uSkinned

  • More than a theme for Umbraco CMS, take full control of your content and design with a feature-rich, award-nominated & content editor focused website platform.
UmbHost

  • Affordable, Geo-Redundant, Umbraco hosting which gives back to the community by sponsoring an Umbraco Open Source Developer with each hosting package sold.