blog template detection: if post body is not found on homepage, attempt to template off of post

This commit is contained in:
Nick Vella 2019-07-22 02:10:24 +10:00
parent 4e35f56a55
commit c6952ab4e8
2 changed files with 109 additions and 44 deletions

View File

@ -1,10 +1,17 @@
// Copyright (c) .NET Foundation. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for details.
using mshtml;
using OpenLiveWriter.BlogClient.Clients;
using OpenLiveWriter.Controls;
using OpenLiveWriter.CoreServices;
using OpenLiveWriter.CoreServices.Progress;
using OpenLiveWriter.Extensibility.BlogClient;
using OpenLiveWriter.Localization;
using OpenLiveWriter.Mshtml;
using System;
using System.Collections;
using System.Diagnostics;
using System.ComponentModel;
using System.Drawing;
using System.Globalization;
using System.IO;
@ -12,18 +19,7 @@ using System.Net;
using System.Runtime.InteropServices;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;
using mshtml;
using OpenLiveWriter.BlogClient;
using OpenLiveWriter.BlogClient.Clients;
using OpenLiveWriter.Extensibility.BlogClient;
using OpenLiveWriter.HtmlParser.Parser;
using OpenLiveWriter.Localization;
using OpenLiveWriter.Mshtml;
using OpenLiveWriter.Controls;
using OpenLiveWriter.CoreServices;
using OpenLiveWriter.CoreServices.Progress;
namespace OpenLiveWriter.BlogClient.Detection
{
@ -158,6 +154,8 @@ namespace OpenLiveWriter.BlogClient.Detection
}
private Exception _exception;
private string _nextTryPostUrl;
public object DetectTemplate(IProgressHost progress)
{
// if our context has not been set then just return without doing anything
@ -385,7 +383,7 @@ namespace OpenLiveWriter.BlogClient.Detection
BlogPostRegionLocatorStrategy regionLocatorStrategy = regionLocatorStrategies[i];
try
{
blogTemplateFiles = GetBlogTemplateFiles(progress, regionLocatorStrategy, templateStrategies, targetTemplateTypes);
blogTemplateFiles = GetBlogTemplateFiles(progress, regionLocatorStrategy, templateStrategies, targetTemplateTypes, _blogHomepageUrl);
progress.UpdateProgress(100, 100);
//if any exception occurred along the way, clear them since one of the template strategies
@ -439,8 +437,12 @@ namespace OpenLiveWriter.BlogClient.Detection
/// <param name="regionLocatorStrategy"></param>
/// <param name="templateStrategies"></param>
/// <param name="templateTypes"></param>
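/// <param name="targetUrl">
/// The URL to analyze. If a post title can be located on this page but not the post body,
/// the post page itself is fetched and analyzed instead (only when this is the blog
/// homepage URL and the region locator strategy supports refetching).
/// </param>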
/// <returns></returns>
private BlogEditingTemplateFile[] GetBlogTemplateFiles(IProgressHost progress, BlogPostRegionLocatorStrategy regionLocatorStrategy, BlogEditingTemplateStrategy[] templateStrategies, BlogEditingTemplateType[] templateTypes)
private BlogEditingTemplateFile[] GetBlogTemplateFiles(IProgressHost progress, BlogPostRegionLocatorStrategy regionLocatorStrategy, BlogEditingTemplateStrategy[] templateStrategies, BlogEditingTemplateType[] templateTypes, string targetUrl)
{
BlogEditingTemplateFile[] blogTemplateFiles = null;
try
@ -457,10 +459,27 @@ namespace OpenLiveWriter.BlogClient.Detection
CheckCancelRequested(parseTick);
templateStrategy = templateStrategies[i];
// Clear _nextTryPostUrl flag
_nextTryPostUrl = null;
// Parse the blog post HTML into an editing template.
// Note: we can't use MarkupServices to parse the document from a non-UI thread,
// so we have to execute the parsing portion of the template download operation on the UI thread.
string editingTemplate = ParseWebpageIntoEditingTemplate_OnUIThread(_parentControl, regionLocatorStrategy, new ProgressTick(parseTick, 1, 5));
string editingTemplate = ParseWebpageIntoEditingTemplate_OnUIThread(_parentControl, regionLocatorStrategy, new ProgressTick(parseTick, 1, 5), targetUrl);
// If there's no editing template, there should be a URL to try next
Debug.Assert(editingTemplate != null || (editingTemplate == null && _nextTryPostUrl != null));
// If the homepage has just been analysed and the _nextTryPostUrl flag is set
if (targetUrl == _blogHomepageUrl && _nextTryPostUrl != null && regionLocatorStrategy.CanRefetchPage)
{
// Try fetching the URL that has been specified, and reparse
progress.UpdateProgress("Post contents not present on homepage, checking post..."); // TODO use strings
// Fetch the post page
regionLocatorStrategy.FetchTemporaryPostPage(SilentProgressHost.Instance, _nextTryPostUrl);
// Parse out the template
editingTemplate = ParseWebpageIntoEditingTemplate_OnUIThread(_parentControl, regionLocatorStrategy, new ProgressTick(parseTick, 1, 5), _nextTryPostUrl);
}
// check for cancel
CheckCancelRequested(parseTick);
@ -540,19 +559,48 @@ namespace OpenLiveWriter.BlogClient.Detection
/// <param name="uiContext"></param>
/// <param name="progress"></param>
/// <returns></returns>
private string ParseWebpageIntoEditingTemplate_OnUIThread(Control uiContext, BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress)
private string ParseWebpageIntoEditingTemplate_OnUIThread(Control uiContext, BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress, string postUrl)
{
BlogEditingTemplate blogEditingTemplate = (BlogEditingTemplate)uiContext.Invoke(new TemplateParser(ParseBlogPostIntoTemplate), new object[] { regionLocator, new ProgressTick(progress, 1, 100) });
return blogEditingTemplate.Template;
// Run ParseBlogPostIntoTemplate through uiContext.Invoke. The argument array must line up
// with the TemplateParser delegate's parameters, in order: locator, progress, post URL.
BlogEditingTemplate blogEditingTemplate = (BlogEditingTemplate)uiContext.Invoke(
new TemplateParser(ParseBlogPostIntoTemplate),
new object[] {
regionLocator,
new ProgressTick(progress, 1, 100),
postUrl });
// ParseBlogPostIntoTemplate returns null when it found only a title link and recorded
// _nextTryPostUrl, so hand null back to the caller instead of dereferencing it.
return blogEditingTemplate?.Template;
}
// Delegate shape used to pass ParseBlogPostIntoTemplate to Control.Invoke; its parameter
// list must stay in step with ParseBlogPostIntoTemplate's signature.
private delegate BlogEditingTemplate TemplateParser(BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress);
private delegate BlogEditingTemplate TemplateParser(BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress, string postUrl);
private BlogEditingTemplate ParseBlogPostIntoTemplate(BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress)
private BlogEditingTemplate ParseBlogPostIntoTemplate(BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress, string postUrl)
{
progress.UpdateProgress(Res.Get(StringId.ProgressCreatingEditingTemplate));
BlogPostRegions regions = regionLocator.LocateRegionsOnUIThread(progress);
BlogPostRegions regions = regionLocator.LocateRegionsOnUIThread(progress, postUrl);
IHTMLElement primaryTitleRegion = GetPrimaryEditableTitleElement(regions.BodyRegion, regions.Document, regions.TitleRegions);
// IF
// - primaryTitleRegion is not null (title found)
// - BodyRegion is null (no post body found)
// - AND primaryTitleRegion is a link
if (primaryTitleRegion != null && regions.BodyRegion == null && primaryTitleRegion.tagName.ToLower() == "a")
{
// Title region was detected, but body region was not.
// It is possible that only titles are shown on the homepage
// Try requesting the post itself, and loading regions from the post itself
// HACK Somewhere the 'about:' protocol replaces http/https, replace it again with the correct protocol
var pathMatch = new Regex("^about:(.*)$").Match((primaryTitleRegion as IHTMLAnchorElement).href);
Debug.Assert(pathMatch.Success); // Assert that this URL is to the format we expect
var newPostPath = pathMatch.Groups[1].Value; // Grab the path from the URL
var homepageUri = new Uri(_blogHomepageUrl);
var newPostUrl = $"{homepageUri.Scheme}://{homepageUri.Host}{newPostPath}"; // Recreate the full post URL
// Set the _nextTryPostUrl flag on this detector.
// This indicates to the calling (non-UI) thread that another page should be fetched and parsed.
_nextTryPostUrl = newPostUrl;
return null;
}
BlogEditingTemplate template = GenerateBlogTemplate((IHTMLDocument3)regions.Document, primaryTitleRegion, regions.TitleRegions, regions.BodyRegion);
progress.UpdateProgress(100, 100);
@ -696,7 +744,6 @@ namespace OpenLiveWriter.BlogClient.Detection
// return value
private BlogEditingTemplateFile[] _blogTemplateFiles = new BlogEditingTemplateFile[0];
private Color? _postBodyBackgroundColor;
}
public delegate HttpWebResponse PageDownloader(string url, int timeoutMs);

View File

@ -40,6 +40,7 @@ namespace OpenLiveWriter.BlogClient.Detection
protected IBlogCredentialsAccessor _credentials;
protected string _blogHomepageUrl;
protected PageDownloader _pageDownloader;
public BlogPostRegionLocatorStrategy(IBlogClient blogClient, BlogAccount blogAccount, IBlogCredentialsAccessor credentials, string blogHomepageUrl, PageDownloader pageDownloader)
{
_blogClient = blogClient;
@ -50,9 +51,12 @@ namespace OpenLiveWriter.BlogClient.Detection
}
public abstract void PrepareRegions(IProgressHost progress);
public abstract BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress);
public virtual void FetchTemporaryPostPage(IProgressHost progress, string url) { }
public abstract BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress, string pageUrl);
public abstract void CleanupRegions(IProgressHost progress);
public virtual bool CanRefetchPage => false;
protected void CheckCancelRequested(IProgressHost progress)
{
if (progress.CancelRequested)
@ -69,9 +73,11 @@ namespace OpenLiveWriter.BlogClient.Detection
internal class TemporaryPostRegionLocatorStrategy : BlogPostRegionLocatorStrategy
{
BlogPost temporaryPost;
Stream blogHomepageContents;
Stream blogPageContents;
BlogPostRegionLocatorBooleanCallback containsBlogPosts;
public override bool CanRefetchPage => true;
private const string TEMPORARY_POST_STABLE_GUID = "3bfe001a-32de-4114-a6b4-4005b770f6d7";
private string TEMPORARY_POST_BODY_GUID = Guid.NewGuid().ToString();
private string TEMPORARY_POST_TITLE_GUID = Guid.NewGuid().ToString();
@ -112,27 +118,36 @@ namespace OpenLiveWriter.BlogClient.Detection
// Publish a temporary post so that we can examine HTML that will surround posts created with the editor
temporaryPost = PostTemplate(new ProgressTick(progress, 25, 100));
CheckCancelRequested(progress);
FetchTemporaryPostPage(progress, _blogHomepageUrl);
}
blogHomepageContents = new MemoryStream();
/// <summary>
/// Fetches the blog page at the specified URL via DownloadBlogPage and copies it into a
/// fresh MemoryStream held in blogPageContents, replacing whatever page was stored before.
/// </summary>
/// <param name="progress">
/// Progress host handed to DownloadBlogPage, checked for cancellation once the download
/// returns, and marked complete (100 of 100) before this method exits.
/// </param>
/// <param name="url">
/// Address of the page to download. PrepareRegions passes the blog homepage URL; the
/// template detector may call again with the URL of an individual post.
/// </param>
public override void FetchTemporaryPostPage(IProgressHost progress, string url)
{
blogPageContents = new MemoryStream();
// Download the webpage that contains the temporary blog post.
// WARNING, DownloadBlogPage uses an MSHTML Document on a non-UI thread...which is a no-no!
// It's been this way through several betas without problem, so we'll keep it that way for now, but
// it needs to be fixed eventually.
Stream postHtmlContents = DownloadBlogPage(_blogHomepageUrl, progress);
Stream postHtmlContents = DownloadBlogPage(url, progress);
CheckCancelRequested(progress);
// The using block disposes the downloaded stream once its contents have been copied.
using (postHtmlContents)
{
StreamHelper.Transfer(postHtmlContents, blogHomepageContents);
StreamHelper.Transfer(postHtmlContents, blogPageContents);
}
progress.UpdateProgress(100, 100);
}
/// <summary>
/// Rewinds the previously fetched page stream and parses it to locate the blog post regions.
/// </summary>
/// <param name="progress">Progress host forwarded to ParseBlogPostIntoTemplate.</param>
/// <param name="pageUrl">
/// URL of the page held in blogPageContents, forwarded to ParseBlogPostIntoTemplate in place
/// of the fixed homepage URL (presumably used as the document's URL when parsing - confirm).
/// </param>
/// <returns>The regions reported by ParseBlogPostIntoTemplate.</returns>
public override BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress)
public override BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress, string pageUrl)
{
blogHomepageContents.Seek(0, SeekOrigin.Begin);
return ParseBlogPostIntoTemplate(blogHomepageContents, _blogHomepageUrl, progress);
// The stream was left positioned at its end by the transfer in FetchTemporaryPostPage,
// so seek back to the start before reading it.
blogPageContents.Seek(0, SeekOrigin.Begin);
return ParseBlogPostIntoTemplate(blogPageContents, pageUrl, progress);
}
public override void CleanupRegions(IProgressHost progress)
@ -194,12 +209,12 @@ namespace OpenLiveWriter.BlogClient.Detection
}
/// <summary>
/// Downloads a webpage from a blog.
/// Downloads a webpage from a blog and searches for TEMPORARY_POST_TITLE_GUID.
/// </summary>
/// <param name="blogHomepageUrl"></param>
/// <param name="blogPageUrl"></param>
/// <param name="progress"></param>
/// <returns></returns>
private Stream DownloadBlogPage(string blogHomepageUrl, IProgressHost progress)
/// <returns>Stream containing document which contains TEMPORARY_POST_TITLE_GUID.</returns>
private Stream DownloadBlogPage(string blogPageUrl, IProgressHost progress)
{
ProgressTick tick = new ProgressTick(progress, 50, 100);
MemoryStream memStream = new MemoryStream();
@ -218,14 +233,17 @@ namespace OpenLiveWriter.BlogClient.Detection
// This means we'll try for 5 minutes (10s + 290s = 300s) before we consider the operation timed out.
Thread.Sleep(i < 10 ? 1000 : 10000);
HttpWebResponse resp = _pageDownloader(blogHomepageUrl, 60000);
// Add random parameter to URL to bypass cache
var urlRandom = UrlHelper.AppendQueryParameters(blogPageUrl, new string[] { Guid.NewGuid().ToString() });
HttpWebResponse resp = _pageDownloader(urlRandom, 60000);
memStream = new MemoryStream();
using (Stream respStream = resp.GetResponseStream())
StreamHelper.Transfer(respStream, memStream);
//read in the HTML file and determine if it contains the title element
memStream.Seek(0, SeekOrigin.Begin);
doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(memStream, blogHomepageUrl);
doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(memStream, urlRandom);
if (HTMLDocumentHelper.FindElementContainingText(doc2, TEMPORARY_POST_TITLE_GUID) == null)
doc2 = null;
}
@ -302,7 +320,7 @@ namespace OpenLiveWriter.BlogClient.Detection
{
private string _titleText;
private string _bodyText;
private MemoryStream blogHomepageContents;
private MemoryStream blogPageContents;
BlogPost mostRecentPost;
private int recentPostCount = -1;
public RecentPostRegionLocatorStrategy(IBlogClient blogClient, BlogAccount blogAccount,
@ -339,13 +357,13 @@ namespace OpenLiveWriter.BlogClient.Detection
if (normalizedTitleText.IndexOf(normalizedBodyText, StringComparison.CurrentCulture) != -1) //body text is a subset of the title text
throw new ArgumentException("Content text is not unique enough to use for style detection");
blogHomepageContents = DownloadBlogPage(_blogHomepageUrl, progress);
blogPageContents = DownloadBlogPage(_blogHomepageUrl, progress);
}
public override BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress)
public override BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress, string pageUrl)
{
blogHomepageContents.Seek(0, SeekOrigin.Begin);
IHTMLDocument2 doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(blogHomepageContents, _blogHomepageUrl);
blogPageContents.Seek(0, SeekOrigin.Begin);
IHTMLDocument2 doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(blogPageContents, pageUrl);
// Ensure that the document is fully loaded.
// If it is not fully loaded, then viewing its current style is non-deterministic.
@ -511,10 +529,10 @@ namespace OpenLiveWriter.BlogClient.Detection
public override void CleanupRegions(IProgressHost progress)
{
if (blogHomepageContents != null)
if (blogPageContents != null)
{
blogHomepageContents.Close();
blogHomepageContents = null;
blogPageContents.Close();
blogPageContents = null;
}
progress.UpdateProgress(100, 100);