blog template detection: if post body is not found on homepage, attempt to template off of post
This commit is contained in:
parent
4e35f56a55
commit
c6952ab4e8
|
@ -1,10 +1,17 @@
|
|||
// Copyright (c) .NET Foundation. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE file in the project root for details.
|
||||
|
||||
using mshtml;
|
||||
using OpenLiveWriter.BlogClient.Clients;
|
||||
using OpenLiveWriter.Controls;
|
||||
using OpenLiveWriter.CoreServices;
|
||||
using OpenLiveWriter.CoreServices.Progress;
|
||||
using OpenLiveWriter.Extensibility.BlogClient;
|
||||
using OpenLiveWriter.Localization;
|
||||
using OpenLiveWriter.Mshtml;
|
||||
using System;
|
||||
using System.Collections;
|
||||
using System.Diagnostics;
|
||||
using System.ComponentModel;
|
||||
using System.Drawing;
|
||||
using System.Globalization;
|
||||
using System.IO;
|
||||
|
@ -12,18 +19,7 @@ using System.Net;
|
|||
using System.Runtime.InteropServices;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Threading;
|
||||
using System.Windows.Forms;
|
||||
using mshtml;
|
||||
using OpenLiveWriter.BlogClient;
|
||||
using OpenLiveWriter.BlogClient.Clients;
|
||||
using OpenLiveWriter.Extensibility.BlogClient;
|
||||
using OpenLiveWriter.HtmlParser.Parser;
|
||||
using OpenLiveWriter.Localization;
|
||||
using OpenLiveWriter.Mshtml;
|
||||
using OpenLiveWriter.Controls;
|
||||
using OpenLiveWriter.CoreServices;
|
||||
using OpenLiveWriter.CoreServices.Progress;
|
||||
|
||||
namespace OpenLiveWriter.BlogClient.Detection
|
||||
{
|
||||
|
@ -158,6 +154,8 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
}
|
||||
private Exception _exception;
|
||||
|
||||
private string _nextTryPostUrl;
|
||||
|
||||
public object DetectTemplate(IProgressHost progress)
|
||||
{
|
||||
// if our context has not been set then just return without doing anything
|
||||
|
@ -385,7 +383,7 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
BlogPostRegionLocatorStrategy regionLocatorStrategy = regionLocatorStrategies[i];
|
||||
try
|
||||
{
|
||||
blogTemplateFiles = GetBlogTemplateFiles(progress, regionLocatorStrategy, templateStrategies, targetTemplateTypes);
|
||||
blogTemplateFiles = GetBlogTemplateFiles(progress, regionLocatorStrategy, templateStrategies, targetTemplateTypes, _blogHomepageUrl);
|
||||
progress.UpdateProgress(100, 100);
|
||||
|
||||
//if any exceptions occurred along the way, clear them since one of the template strategies
|
||||
|
@ -439,8 +437,12 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
/// <param name="regionLocatorStrategy"></param>
|
||||
/// <param name="templateStrategies"></param>
|
||||
/// <param name="templateTypes"></param>
|
||||
/// <param name="targetUrl">
|
||||
/// The URL to analyze. If a post can be located, but not the body, this is used
|
||||
/// to reiterate into the post to fetch its content directly.
|
||||
/// </param>
|
||||
/// <returns></returns>
|
||||
private BlogEditingTemplateFile[] GetBlogTemplateFiles(IProgressHost progress, BlogPostRegionLocatorStrategy regionLocatorStrategy, BlogEditingTemplateStrategy[] templateStrategies, BlogEditingTemplateType[] templateTypes)
|
||||
private BlogEditingTemplateFile[] GetBlogTemplateFiles(IProgressHost progress, BlogPostRegionLocatorStrategy regionLocatorStrategy, BlogEditingTemplateStrategy[] templateStrategies, BlogEditingTemplateType[] templateTypes, string targetUrl)
|
||||
{
|
||||
BlogEditingTemplateFile[] blogTemplateFiles = null;
|
||||
try
|
||||
|
@ -457,10 +459,27 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
CheckCancelRequested(parseTick);
|
||||
templateStrategy = templateStrategies[i];
|
||||
|
||||
// Clear _nextTryPostUrl flag
|
||||
_nextTryPostUrl = null;
|
||||
|
||||
// Parse the blog post HTML into an editing template.
|
||||
// Note: we can't use MarkupServices to parse the document from a non-UI thread,
|
||||
// so we have to execute the parsing portion of the template download operation on the UI thread.
|
||||
string editingTemplate = ParseWebpageIntoEditingTemplate_OnUIThread(_parentControl, regionLocatorStrategy, new ProgressTick(parseTick, 1, 5));
|
||||
string editingTemplate = ParseWebpageIntoEditingTemplate_OnUIThread(_parentControl, regionLocatorStrategy, new ProgressTick(parseTick, 1, 5), targetUrl);
|
||||
|
||||
// If there's no editing template, there should be a URL to try next
|
||||
Debug.Assert(editingTemplate != null || (editingTemplate == null && _nextTryPostUrl != null));
|
||||
|
||||
// If the homepage has just been analysed and the _nextTryPostUrl flag is set
|
||||
if (targetUrl == _blogHomepageUrl && _nextTryPostUrl != null && regionLocatorStrategy.CanRefetchPage)
|
||||
{
|
||||
// Try fetching the URL that has been specified, and reparse
|
||||
progress.UpdateProgress("Post contents not present on homepage, checking post..."); // TODO use strings
|
||||
// Fetch the post page
|
||||
regionLocatorStrategy.FetchTemporaryPostPage(SilentProgressHost.Instance, _nextTryPostUrl);
|
||||
// Parse out the template
|
||||
editingTemplate = ParseWebpageIntoEditingTemplate_OnUIThread(_parentControl, regionLocatorStrategy, new ProgressTick(parseTick, 1, 5), _nextTryPostUrl);
|
||||
}
|
||||
|
||||
// check for cancel
|
||||
CheckCancelRequested(parseTick);
|
||||
|
@ -540,19 +559,48 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
/// <param name="uiContext"></param>
|
||||
/// <param name="progress"></param>
|
||||
/// <returns></returns>
|
||||
private string ParseWebpageIntoEditingTemplate_OnUIThread(Control uiContext, BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress)
|
||||
private string ParseWebpageIntoEditingTemplate_OnUIThread(Control uiContext, BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress, string postUrl)
|
||||
{
|
||||
BlogEditingTemplate blogEditingTemplate = (BlogEditingTemplate)uiContext.Invoke(new TemplateParser(ParseBlogPostIntoTemplate), new object[] { regionLocator, new ProgressTick(progress, 1, 100) });
|
||||
return blogEditingTemplate.Template;
|
||||
BlogEditingTemplate blogEditingTemplate = (BlogEditingTemplate)uiContext.Invoke(
|
||||
new TemplateParser(ParseBlogPostIntoTemplate),
|
||||
new object[] {
|
||||
regionLocator,
|
||||
new ProgressTick(progress, 1, 100),
|
||||
postUrl });
|
||||
return blogEditingTemplate?.Template;
|
||||
}
|
||||
private delegate BlogEditingTemplate TemplateParser(BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress);
|
||||
private delegate BlogEditingTemplate TemplateParser(BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress, string postUrl);
|
||||
|
||||
private BlogEditingTemplate ParseBlogPostIntoTemplate(BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress)
|
||||
private BlogEditingTemplate ParseBlogPostIntoTemplate(BlogPostRegionLocatorStrategy regionLocator, IProgressHost progress, string postUrl)
|
||||
{
|
||||
progress.UpdateProgress(Res.Get(StringId.ProgressCreatingEditingTemplate));
|
||||
|
||||
BlogPostRegions regions = regionLocator.LocateRegionsOnUIThread(progress);
|
||||
BlogPostRegions regions = regionLocator.LocateRegionsOnUIThread(progress, postUrl);
|
||||
IHTMLElement primaryTitleRegion = GetPrimaryEditableTitleElement(regions.BodyRegion, regions.Document, regions.TitleRegions);
|
||||
|
||||
// IF
|
||||
// - primaryTitleRegion is not null (title found)
|
||||
// - BodyRegion is null (no post body found)
|
||||
// - AND primaryTitleRegion is a link
|
||||
if (primaryTitleRegion != null && regions.BodyRegion == null && primaryTitleRegion.tagName.ToLower() == "a")
|
||||
{
|
||||
// Title region was detected, but body region was not.
|
||||
// It is possible that only titles are shown on the homepage
|
||||
// Try requesting the post itself, and loading regions from the post itself
|
||||
|
||||
// HACK Somewhere the 'about:' protocol replaces http/https, replace it again with the correct protocol
|
||||
var pathMatch = new Regex("^about:(.*)$").Match((primaryTitleRegion as IHTMLAnchorElement).href);
|
||||
Debug.Assert(pathMatch.Success); // Assert that this URL is to the format we expect
|
||||
var newPostPath = pathMatch.Groups[1].Value; // Grab the path from the URL
|
||||
var homepageUri = new Uri(_blogHomepageUrl);
|
||||
var newPostUrl = $"{homepageUri.Scheme}://{homepageUri.Host}{newPostPath}"; // Recreate the full post URL
|
||||
|
||||
// Set the _nextTryPostUrl field of this class (it is not a member of the region locator)
|
||||
// This will indicate to the other thread that another page should be parsed
|
||||
_nextTryPostUrl = newPostUrl;
|
||||
return null;
|
||||
}
|
||||
|
||||
BlogEditingTemplate template = GenerateBlogTemplate((IHTMLDocument3)regions.Document, primaryTitleRegion, regions.TitleRegions, regions.BodyRegion);
|
||||
|
||||
progress.UpdateProgress(100, 100);
|
||||
|
@ -696,7 +744,6 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
// return value
|
||||
private BlogEditingTemplateFile[] _blogTemplateFiles = new BlogEditingTemplateFile[0];
|
||||
private Color? _postBodyBackgroundColor;
|
||||
|
||||
}
|
||||
|
||||
public delegate HttpWebResponse PageDownloader(string url, int timeoutMs);
|
||||
|
|
|
@ -40,6 +40,7 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
protected IBlogCredentialsAccessor _credentials;
|
||||
protected string _blogHomepageUrl;
|
||||
protected PageDownloader _pageDownloader;
|
||||
|
||||
public BlogPostRegionLocatorStrategy(IBlogClient blogClient, BlogAccount blogAccount, IBlogCredentialsAccessor credentials, string blogHomepageUrl, PageDownloader pageDownloader)
|
||||
{
|
||||
_blogClient = blogClient;
|
||||
|
@ -50,9 +51,12 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
}
|
||||
|
||||
public abstract void PrepareRegions(IProgressHost progress);
|
||||
public abstract BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress);
|
||||
public virtual void FetchTemporaryPostPage(IProgressHost progress, string url) { }
|
||||
public abstract BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress, string pageUrl);
|
||||
public abstract void CleanupRegions(IProgressHost progress);
|
||||
|
||||
public virtual bool CanRefetchPage => false;
|
||||
|
||||
protected void CheckCancelRequested(IProgressHost progress)
|
||||
{
|
||||
if (progress.CancelRequested)
|
||||
|
@ -69,9 +73,11 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
internal class TemporaryPostRegionLocatorStrategy : BlogPostRegionLocatorStrategy
|
||||
{
|
||||
BlogPost temporaryPost;
|
||||
Stream blogHomepageContents;
|
||||
Stream blogPageContents;
|
||||
BlogPostRegionLocatorBooleanCallback containsBlogPosts;
|
||||
|
||||
public override bool CanRefetchPage => true;
|
||||
|
||||
private const string TEMPORARY_POST_STABLE_GUID = "3bfe001a-32de-4114-a6b4-4005b770f6d7";
|
||||
private string TEMPORARY_POST_BODY_GUID = Guid.NewGuid().ToString();
|
||||
private string TEMPORARY_POST_TITLE_GUID = Guid.NewGuid().ToString();
|
||||
|
@ -112,27 +118,36 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
// Publish a temporary post so that we can examine HTML that will surround posts created with the editor
|
||||
temporaryPost = PostTemplate(new ProgressTick(progress, 25, 100));
|
||||
CheckCancelRequested(progress);
|
||||
FetchTemporaryPostPage(progress, _blogHomepageUrl);
|
||||
}
|
||||
|
||||
blogHomepageContents = new MemoryStream();
|
||||
/// <summary>
|
||||
/// Fetch a blog page from the URL specified and transfer it into blogPageContents
|
||||
/// </summary>
|
||||
/// <param name="progress"></param>
|
||||
/// <param name="url"></param>
|
||||
public override void FetchTemporaryPostPage(IProgressHost progress, string url)
|
||||
{
|
||||
blogPageContents = new MemoryStream();
|
||||
|
||||
// Download the webpage that contains the temporary blog post
|
||||
// WARNING, DownloadBlogPage uses an MSHTML Document on a non-UI thread...which is a no-no!
|
||||
// it's been this way through several betas without problem, so we'll keep it that way for now, but
|
||||
// it needs to be fixed eventually.
|
||||
Stream postHtmlContents = DownloadBlogPage(_blogHomepageUrl, progress);
|
||||
Stream postHtmlContents = DownloadBlogPage(url, progress);
|
||||
CheckCancelRequested(progress);
|
||||
|
||||
using (postHtmlContents)
|
||||
{
|
||||
StreamHelper.Transfer(postHtmlContents, blogHomepageContents);
|
||||
StreamHelper.Transfer(postHtmlContents, blogPageContents);
|
||||
}
|
||||
progress.UpdateProgress(100, 100);
|
||||
}
|
||||
|
||||
public override BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress)
|
||||
public override BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress, string pageUrl)
|
||||
{
|
||||
blogHomepageContents.Seek(0, SeekOrigin.Begin);
|
||||
return ParseBlogPostIntoTemplate(blogHomepageContents, _blogHomepageUrl, progress);
|
||||
blogPageContents.Seek(0, SeekOrigin.Begin);
|
||||
return ParseBlogPostIntoTemplate(blogPageContents, pageUrl, progress);
|
||||
}
|
||||
|
||||
public override void CleanupRegions(IProgressHost progress)
|
||||
|
@ -194,12 +209,12 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
}
|
||||
|
||||
/// <summary>
|
||||
/// Downloads a webpage from a blog.
|
||||
/// Downloads a webpage from a blog and searches for TEMPORARY_POST_TITLE_GUID.
|
||||
/// </summary>
|
||||
/// <param name="blogHomepageUrl"></param>
|
||||
/// <param name="blogPageUrl"></param>
|
||||
/// <param name="progress"></param>
|
||||
/// <returns></returns>
|
||||
private Stream DownloadBlogPage(string blogHomepageUrl, IProgressHost progress)
|
||||
/// <returns>Stream containing document which contains TEMPORARY_POST_TITLE_GUID.</returns>
|
||||
private Stream DownloadBlogPage(string blogPageUrl, IProgressHost progress)
|
||||
{
|
||||
ProgressTick tick = new ProgressTick(progress, 50, 100);
|
||||
MemoryStream memStream = new MemoryStream();
|
||||
|
@ -218,14 +233,17 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
// This means we'll try for 5 minutes (10s + 290s = 300s) before we consider the operation timed out.
|
||||
Thread.Sleep(i < 10 ? 1000 : 10000);
|
||||
|
||||
HttpWebResponse resp = _pageDownloader(blogHomepageUrl, 60000);
|
||||
// Add random parameter to URL to bypass cache
|
||||
var urlRandom = UrlHelper.AppendQueryParameters(blogPageUrl, new string[] { Guid.NewGuid().ToString() });
|
||||
|
||||
HttpWebResponse resp = _pageDownloader(urlRandom, 60000);
|
||||
memStream = new MemoryStream();
|
||||
using (Stream respStream = resp.GetResponseStream())
|
||||
StreamHelper.Transfer(respStream, memStream);
|
||||
|
||||
//read in the HTML file and determine if it contains the title element
|
||||
memStream.Seek(0, SeekOrigin.Begin);
|
||||
doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(memStream, blogHomepageUrl);
|
||||
doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(memStream, urlRandom);
|
||||
if (HTMLDocumentHelper.FindElementContainingText(doc2, TEMPORARY_POST_TITLE_GUID) == null)
|
||||
doc2 = null;
|
||||
}
|
||||
|
@ -302,7 +320,7 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
{
|
||||
private string _titleText;
|
||||
private string _bodyText;
|
||||
private MemoryStream blogHomepageContents;
|
||||
private MemoryStream blogPageContents;
|
||||
BlogPost mostRecentPost;
|
||||
private int recentPostCount = -1;
|
||||
public RecentPostRegionLocatorStrategy(IBlogClient blogClient, BlogAccount blogAccount,
|
||||
|
@ -339,13 +357,13 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
if (normalizedTitleText.IndexOf(normalizedBodyText, StringComparison.CurrentCulture) != -1) //body text is a subset of the title text
|
||||
throw new ArgumentException("Content text is not unique enough to use for style detection");
|
||||
|
||||
blogHomepageContents = DownloadBlogPage(_blogHomepageUrl, progress);
|
||||
blogPageContents = DownloadBlogPage(_blogHomepageUrl, progress);
|
||||
}
|
||||
|
||||
public override BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress)
|
||||
public override BlogPostRegions LocateRegionsOnUIThread(IProgressHost progress, string pageUrl)
|
||||
{
|
||||
blogHomepageContents.Seek(0, SeekOrigin.Begin);
|
||||
IHTMLDocument2 doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(blogHomepageContents, _blogHomepageUrl);
|
||||
blogPageContents.Seek(0, SeekOrigin.Begin);
|
||||
IHTMLDocument2 doc2 = HTMLDocumentHelper.GetHTMLDocumentFromStream(blogPageContents, pageUrl);
|
||||
|
||||
// Ensure that the document is fully loaded.
|
||||
// If it is not fully loaded, then viewing its current style is non-deterministic.
|
||||
|
@ -511,10 +529,10 @@ namespace OpenLiveWriter.BlogClient.Detection
|
|||
|
||||
public override void CleanupRegions(IProgressHost progress)
|
||||
{
|
||||
if (blogHomepageContents != null)
|
||||
if (blogPageContents != null)
|
||||
{
|
||||
blogHomepageContents.Close();
|
||||
blogHomepageContents = null;
|
||||
blogPageContents.Close();
|
||||
blogPageContents = null;
|
||||
}
|
||||
|
||||
progress.UpdateProgress(100, 100);
|
||||
|
|
Loading…
Reference in New Issue