From c6952ab4e80eefb28149112b728d66bbba8e62f2 Mon Sep 17 00:00:00 2001 From: Nick Vella Date: Mon, 22 Jul 2019 02:10:24 +1000 Subject: [PATCH] blog template detection: if post body is not found on homepage, attempt to template off of post --- .../Detection/BlogEditingTemplateDetector.cs | 91 ++++++++++++++----- .../BlogPostRegionLocatorStrategy.cs | 62 ++++++++----- 2 files changed, 109 insertions(+), 44 deletions(-) diff --git a/src/managed/OpenLiveWriter.BlogClient/Detection/BlogEditingTemplateDetector.cs b/src/managed/OpenLiveWriter.BlogClient/Detection/BlogEditingTemplateDetector.cs index 4ac833f1..b1c75835 100644 --- a/src/managed/OpenLiveWriter.BlogClient/Detection/BlogEditingTemplateDetector.cs +++ b/src/managed/OpenLiveWriter.BlogClient/Detection/BlogEditingTemplateDetector.cs @@ -1,10 +1,17 @@ // Copyright (c) .NET Foundation. All rights reserved. // Licensed under the MIT license. See LICENSE file in the project root for details. +using mshtml; +using OpenLiveWriter.BlogClient.Clients; +using OpenLiveWriter.Controls; +using OpenLiveWriter.CoreServices; +using OpenLiveWriter.CoreServices.Progress; +using OpenLiveWriter.Extensibility.BlogClient; +using OpenLiveWriter.Localization; +using OpenLiveWriter.Mshtml; using System; using System.Collections; using System.Diagnostics; -using System.ComponentModel; using System.Drawing; using System.Globalization; using System.IO; @@ -12,18 +19,7 @@ using System.Net; using System.Runtime.InteropServices; using System.Text; using System.Text.RegularExpressions; -using System.Threading; using System.Windows.Forms; -using mshtml; -using OpenLiveWriter.BlogClient; -using OpenLiveWriter.BlogClient.Clients; -using OpenLiveWriter.Extensibility.BlogClient; -using OpenLiveWriter.HtmlParser.Parser; -using OpenLiveWriter.Localization; -using OpenLiveWriter.Mshtml; -using OpenLiveWriter.Controls; -using OpenLiveWriter.CoreServices; -using OpenLiveWriter.CoreServices.Progress; namespace OpenLiveWriter.BlogClient.Detection { @@ 
-158,6 +154,8 @@ namespace OpenLiveWriter.BlogClient.Detection } private Exception _exception; + private string _nextTryPostUrl; + public object DetectTemplate(IProgressHost progress) { // if our context has not been set then just return without doing anything @@ -385,7 +383,7 @@ namespace OpenLiveWriter.BlogClient.Detection BlogPostRegionLocatorStrategy regionLocatorStrategy = regionLocatorStrategies[i]; try { - blogTemplateFiles = GetBlogTemplateFiles(progress, regionLocatorStrategy, templateStrategies, targetTemplateTypes); + blogTemplateFiles = GetBlogTemplateFiles(progress, regionLocatorStrategy, templateStrategies, targetTemplateTypes, _blogHomepageUrl); progress.UpdateProgress(100, 100); //if any exception occurred along the way, clear them since one of the template strategies @@ -439,8 +437,12 @@ namespace OpenLiveWriter.BlogClient.Detection /// /// /// + /// + /// The URL to analyze. If a post can be located, but not the body, this is used + /// to go into the post itself and fetch its content directly. + /// /// - private BlogEditingTemplateFile[] GetBlogTemplateFiles(IProgressHost progress, BlogPostRegionLocatorStrategy regionLocatorStrategy, BlogEditingTemplateStrategy[] templateStrategies, BlogEditingTemplateType[] templateTypes) + private BlogEditingTemplateFile[] GetBlogTemplateFiles(IProgressHost progress, BlogPostRegionLocatorStrategy regionLocatorStrategy, BlogEditingTemplateStrategy[] templateStrategies, BlogEditingTemplateType[] templateTypes, string targetUrl) { BlogEditingTemplateFile[] blogTemplateFiles = null; try @@ -457,10 +459,27 @@ namespace OpenLiveWriter.BlogClient.Detection CheckCancelRequested(parseTick); templateStrategy = templateStrategies[i]; + // Clear _nextTryPostUrl flag + _nextTryPostUrl = null; + // Parse the blog post HTML into an editing template. // Note: we can't use MarkupServices to parse the document from a non-UI thread, // so we have to execute the parsing portion of the template download operation on the UI thread.
- string editingTemplate = ParseWebpageIntoEditingTemplate_OnUIThread(_parentControl, regionLocatorStrategy, new ProgressTick(parseTick, 1, 5)); + string editingTemplate = ParseWebpageIntoEditingTemplate_OnUIThread(_parentControl, regionLocatorStrategy, new ProgressTick(parseTick, 1, 5), targetUrl); + + // If there's no editing template, there should be a URL to try next + Debug.Assert(editingTemplate != null || (editingTemplate == null && _nextTryPostUrl != null)); + + // If the homepage has just been analysed and the _nextTryPostUrl flag is set + if (targetUrl == _blogHomepageUrl && _nextTryPostUrl != null && regionLocatorStrategy.CanRefetchPage) + { + // Try fetching the URL that has been specified, and reparse + progress.UpdateProgress("Post contents not present on homepage, checking post..."); // TODO use strings + // Fetch the post page + regionLocatorStrategy.FetchTemporaryPostPage(SilentProgressHost.Instance, _nextTryPostUrl); + // Parse out the template + editingTemplate = ParseWebpageIntoEditingTemplate_OnUIThread(_parentControl, regionLocatorStrategy, new ProgressTick(parseTick, 1, 5), _nextTryPostUrl); + } // check for cancel CheckCancelRequested(parseTick); @@ -540,19 +559,48 @@ namespace OpenLiveWriter.BlogClient.Detection /// /// /// - private string ParseWebpageIntoEditingTemplate_OnUIThread(Control uiContext, BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress) + private string ParseWebpageIntoEditingTemplate_OnUIThread(Control uiContext, BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress, string postUrl) { - BlogEditingTemplate blogEditingTemplate = (BlogEditingTemplate)uiContext.Invoke(new TemplateParser(ParseBlogPostIntoTemplate), new object[] { regionLocator, new ProgressTick(progress, 1, 100) }); - return blogEditingTemplate.Template; + BlogEditingTemplate blogEditingTemplate = (BlogEditingTemplate)uiContext.Invoke( + new TemplateParser(ParseBlogPostIntoTemplate), + new object[] { + regionLocator, + new 
ProgressTick(progress, 1, 100), + postUrl }); + return blogEditingTemplate?.Template; } - private delegate BlogEditingTemplate TemplateParser(BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress); + private delegate BlogEditingTemplate TemplateParser(BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress, string postUrl); - private BlogEditingTemplate ParseBlogPostIntoTemplate(BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress) + private BlogEditingTemplate ParseBlogPostIntoTemplate(BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress, string postUrl) { progress.UpdateProgress(Res.Get(StringId.ProgressCreatingEditingTemplate)); - BlogPostRegions regions = regionLocator.LocateRegionsOnUIThread(progress); + BlogPostRegions regions = regionLocator.LocateRegionsOnUIThread(progress, postUrl); IHTMLElement primaryTitleRegion = GetPrimaryEditableTitleElement(regions.BodyRegion, regions.Document, regions.TitleRegions); + + // IF + // - primaryTitleRegion is not null (title found) + // - BodyRegion is null (no post body found) + // - AND primaryTitleRegion is a link + if (primaryTitleRegion != null && regions.BodyRegion == null && primaryTitleRegion.tagName.ToLower() == "a") + { + // Title region was detected, but body region was not. 
+ // It is possible that only titles are shown on the homepage + // Try requesting the post itself, and loading regions from the post itself + + // HACK Somewhere the 'about:' protocol replaces http/https, replace it again with the correct protocol + var pathMatch = new Regex("^about:(.*)$").Match((primaryTitleRegion as IHTMLAnchorElement).href); + Debug.Assert(pathMatch.Success); // Assert that this URL is in the format we expect + var newPostPath = pathMatch.Groups[1].Value; // Grab the path from the URL + var homepageUri = new Uri(_blogHomepageUrl); + var newPostUrl = $"{homepageUri.Scheme}://{homepageUri.Host}{newPostPath}"; // Recreate the full post URL + + // Set the _nextTryPostUrl flag + // This will indicate to the other thread that another page should be parsed + _nextTryPostUrl = newPostUrl; + return null; + } + BlogEditingTemplate template = GenerateBlogTemplate((IHTMLDocument3)regions.Document, primaryTitleRegion, regions.TitleRegions, regions.BodyRegion); progress.UpdateProgress(100, 100); @@ -696,7 +744,6 @@ namespace OpenLiveWriter.BlogClient.Detection // return value private BlogEditingTemplateFile[] _blogTemplateFiles = new BlogEditingTemplateFile[0]; private Color?
_postBodyBackgroundColor; - } public delegate HttpWebResponse PageDownloader(string url, int timeoutMs); diff --git a/src/managed/OpenLiveWriter.BlogClient/Detection/BlogPostRegionLocatorStrategy.cs b/src/managed/OpenLiveWriter.BlogClient/Detection/BlogPostRegionLocatorStrategy.cs index 03ac4743..3be9c6bf 100644 --- a/src/managed/OpenLiveWriter.BlogClient/Detection/BlogPostRegionLocatorStrategy.cs +++ b/src/managed/OpenLiveWriter.BlogClient/Detection/BlogPostRegionLocatorStrategy.cs @@ -40,6 +40,7 @@ namespace OpenLiveWriter.BlogClient.Detection protected IBlogCredentialsAccessor _credentials; protected string _blogHomepageUrl; protected PageDownloader _pageDownloader; + public BlogPostRegionLocatorStrategy(IBlogClient blogClient, BlogAccount blogAccount, IBlogCredentialsAccessor credentials, string blogHomepageUrl, PageDownloader pageDownloader) { _blogClient = blogClient; @@ -50,9 +51,12 @@ namespace OpenLiveWriter.BlogClient.Detection } public abstract void PrepareRegions(IProgressHost progress); - public abstract BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress); + public virtual void FetchTemporaryPostPage(IProgressHost progress, string url) { } + public abstract BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress, string pageUrl); public abstract void CleanupRegions(IProgressHost progress); + public virtual bool CanRefetchPage => false; + protected void CheckCancelRequested(IProgressHost progress) { if (progress.CancelRequested) @@ -69,9 +73,11 @@ namespace OpenLiveWriter.BlogClient.Detection internal class TemporaryPostRegionLocatorStrategy : BlogPostRegionLocatorStrategy { BlogPost temporaryPost; - Stream blogHomepageContents; + Stream blogPageContents; BlogPostRegionLocatorBooleanCallback containsBlogPosts; + public override bool CanRefetchPage => true; + private const string TEMPORARY_POST_STABLE_GUID = "3bfe001a-32de-4114-a6b4-4005b770f6d7"; private string TEMPORARY_POST_BODY_GUID = Guid.NewGuid().ToString(); private string 
TEMPORARY_POST_TITLE_GUID = Guid.NewGuid().ToString(); @@ -112,27 +118,36 @@ namespace OpenLiveWriter.BlogClient.Detection // Publish a temporary post so that we can examine HTML that will surround posts created with the editor temporaryPost = PostTemplate(new ProgressTick(progress, 25, 100)); CheckCancelRequested(progress); + FetchTemporaryPostPage(progress, _blogHomepageUrl); + } - blogHomepageContents = new MemoryStream(); + /// + /// Fetch a blog page from the URL specified and transfer it into blogPageContents + /// + /// + /// + public override void FetchTemporaryPostPage(IProgressHost progress, string url) + { + blogPageContents = new MemoryStream(); // Download the webpage that is contains the temporary blog post // WARNING, DownloadBlogPage uses an MSHTML Document on a non-UI thread...which is a no-no! // its been this way through several betas without problem, so we'll keep it that way for now, but // it needs to be fixed eventually. - Stream postHtmlContents = DownloadBlogPage(_blogHomepageUrl, progress); + Stream postHtmlContents = DownloadBlogPage(url, progress); CheckCancelRequested(progress); using (postHtmlContents) { - StreamHelper.Transfer(postHtmlContents, blogHomepageContents); + StreamHelper.Transfer(postHtmlContents, blogPageContents); } progress.UpdateProgress(100, 100); } - public override BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress) + public override BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress, string pageUrl) { - blogHomepageContents.Seek(0, SeekOrigin.Begin); - return ParseBlogPostIntoTemplate(blogHomepageContents, _blogHomepageUrl, progress); + blogPageContents.Seek(0, SeekOrigin.Begin); + return ParseBlogPostIntoTemplate(blogPageContents, pageUrl, progress); } public override void CleanupRegions(IProgressHost progress) @@ -194,12 +209,12 @@ namespace OpenLiveWriter.BlogClient.Detection } /// - /// Downloads a webpage from a blog. 
+ /// Downloads a webpage from a blog and searches for TEMPORARY_POST_TITLE_GUID. /// - /// + /// /// - /// - private Stream DownloadBlogPage(string blogHomepageUrl, IProgressHost progress) + /// Stream containing document which contains TEMPORARY_POST_TITLE_GUID. + private Stream DownloadBlogPage(string blogPageUrl, IProgressHost progress) { ProgressTick tick = new ProgressTick(progress, 50, 100); MemoryStream memStream = new MemoryStream(); @@ -218,14 +233,17 @@ namespace OpenLiveWriter.BlogClient.Detection // This means we'll try for 5 minutes (10s + 290s = 300s) before we consider the operation timed out. Thread.Sleep(i < 10 ? 1000 : 10000); - HttpWebResponse resp = _pageDownloader(blogHomepageUrl, 60000); + // Add random parameter to URL to bypass cache + var urlRandom = UrlHelper.AppendQueryParameters(blogPageUrl, new string[] { Guid.NewGuid().ToString() }); + + HttpWebResponse resp = _pageDownloader(urlRandom, 60000); memStream = new MemoryStream(); using (Stream respStream = resp.GetResponseStream()) StreamHelper.Transfer(respStream, memStream); //read in the HTML file and determine if it contains the title element memStream.Seek(0, SeekOrigin.Begin); - doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(memStream, blogHomepageUrl); + doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(memStream, urlRandom); if (HTMLDocumentHelper.FindElementContainingText(doc2, TEMPORARY_POST_TITLE_GUID) == null) doc2 = null; } @@ -302,7 +320,7 @@ namespace OpenLiveWriter.BlogClient.Detection { private string _titleText; private string _bodyText; - private MemoryStream blogHomepageContents; + private MemoryStream blogPageContents; BlogPost mostRecentPost; private int recentPostCount = -1; public RecentPostRegionLocatorStrategy(IBlogClient blogClient, BlogAccount blogAccount, @@ -339,13 +357,13 @@ namespace OpenLiveWriter.BlogClient.Detection if (normalizedTitleText.IndexOf(normalizedBodyText, StringComparison.CurrentCulture) != -1) //body text is a subset of the title text 
throw new ArgumentException("Content text is not unique enough to use for style detection"); - blogHomepageContents = DownloadBlogPage(_blogHomepageUrl, progress); + blogPageContents = DownloadBlogPage(_blogHomepageUrl, progress); } - public override BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress) + public override BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress, string pageUrl) { - blogHomepageContents.Seek(0, SeekOrigin.Begin); - IHTMLDocument2 doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(blogHomepageContents, _blogHomepageUrl); + blogPageContents.Seek(0, SeekOrigin.Begin); + IHTMLDocument2 doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(blogPageContents, pageUrl); // Ensure that the document is fully loaded. // If it is not fully loaded, then viewing its current style is non-deterministic. @@ -511,10 +529,10 @@ namespace OpenLiveWriter.BlogClient.Detection public override void CleanupRegions(IProgressHost progress) { - if (blogHomepageContents != null) + if (blogPageContents != null) { - blogHomepageContents.Close(); - blogHomepageContents = null; + blogPageContents.Close(); + blogPageContents = null; } progress.UpdateProgress(100, 100);