1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
|
Common SQL statements for ht://Check
------------------------------------
Copyright (c) 1999-2004 Comune di Prato - Prato - Italy
Some Portions Copyright (c) 1995-2003 The ht://Dig Group <www.htdig.org>
Author: Gabriele Bartolini - Prato - Italy <angusgb@users.sourceforge.net>
$Id: SQL,v 1.11 2003/12/30 09:38:29 angusgb Exp $
ht://Check is distributed under the GNU General Public License (GPL).
See the COPYING file for license information.
ht://Check is a world-wide-web utility for an intranet or small internet.
-------------------------------------------------------------------
Retrieve all the anchors (A name="anchorname"):
SELECT HtmlAttribute.IDUrl, HtmlAttribute.Content FROM HtmlAttribute,
HtmlStatement WHERE HtmlStatement.IDUrl=HtmlAttribute.IDUrl AND
HtmlAttribute.TagPosition=HtmlStatement.TagPosition AND
HtmlAttribute.Attribute='name' AND HtmlStatement.Tag='a';
Retrieve all the anchor found and not after LinkResult has been set by
the program with the Scheduler::SetLinkResults() method:
SELECT DISTINCT Link.IDUrlSrc, Link.IDUrlDest, Link.TagPosition,
Link.AttrPosition, Link.Anchor, TmpAnchors.IDUrl
FROM Link LEFT JOIN TmpAnchors ON
Link.IDUrlDest=TmpAnchors.IDUrl
AND TmpAnchors.Anchor=Link.Anchor
WHERE Link.Anchor != '' AND Link.LinkResult='OK'
Retrieve all the linked URL:
SELECT Link.*, Url.StatusCode
FROM Link LEFT JOIN Url on Link.IDUrlDest=Url.IDUrl
WHERE Link.LinkResult='NotChecked'
Retrieve all of the groups of LinkResult with the retrieval Status:
SELECT Link.LinkResult, Schedule.Status, COUNT(*)
FROM Link, Schedule
WHERE Link.IDUrlDest=Schedule.IDUrl
GROUP BY Link.LinkResult, Schedule.Status
Retrieve all of the groups of LinkResult:
SELECT Link.LinkResult, COUNT(*)
FROM Link GROUP BY Link.LinkResult
Retrieve all the broken links (1st version - Without redirected URLs, but
with HtmlAttribute too):
SELECT UrlSrc.IDUrl as IDUrlSrc, UrlDest.IDUrl as IDUrlDest,
UrlSrc.Url as UrlSrc, UrlDest.Url as UrlDest,
UrlDest.StatusCode, UrlDest.ReasonPhrase,
UrlDest.ConnStatus, Link.LinkType,
HtmlStatement.Statement, HtmlAttribute.Attribute, HtmlAttribute.Content
FROM Url UrlDest, Url UrlSrc, Link, HtmlStatement, HtmlAttribute
WHERE Link.LinkResult = 'Broken'
AND HtmlStatement.IDUrl = Link.IDUrlSrc
AND HtmlStatement.TagPosition = Link.TagPosition
AND HtmlAttribute.IDUrl = Link.IDUrlSrc
AND HtmlAttribute.TagPosition = Link.TagPosition
AND HtmlAttribute.AttrPosition = Link.AttrPosition
AND UrlSrc.IDUrl = Link.IDUrlSrc
AND UrlDest.IDUrl = Link.IDUrlDest
ORDER BY UrlSrc, UrlDest, Link.TagPosition, Link.AttrPosition
Retrieve all the broken links (2nd version with redirected Urls but withou
HtmlAttribute table):
SELECT UrlSrc.IDUrl as IDUrlSrc, UrlDest.IDUrl as IDUrlDest,
UrlSrc.Url as UrlSrc, UrlDest.Url as UrlDest,
UrlDest.StatusCode, UrlDest.ReasonPhrase,
UrlDest.ConnStatus, Link.LinkType,
HtmlStatement.Statement
FROM Url UrlDest, Url UrlSrc, Link
LEFT JOIN HtmlStatement
ON HtmlStatement.IDUrl = Link.IDUrlSrc
AND HtmlStatement.TagPosition = Link.TagPosition
WHERE Link.LinkResult = 'Broken'
AND UrlSrc.IDUrl = Link.IDUrlSrc
AND UrlDest.IDUrl = Link.IDUrlDest
ORDER BY UrlSrc, UrlDest, Link.TagPosition, Link.AttrPosition
Retrieve all the anchors not found:
SELECT UrlSrc.IDUrl as IDUrlSrc, UrlDest.IDUrl as IDUrlDest,
UrlSrc.Url as UrlSrc, UrlDest.Url as UrlDest,
Link.LinkType, Link.Anchor, HtmlStatement.Statement
FROM Url UrlDest, Url UrlSrc, Link
LEFT JOIN HtmlStatement
ON HtmlStatement.IDUrl = Link.IDUrlSrc
AND HtmlStatement.TagPosition = Link.TagPosition
WHERE Link.LinkResult = 'AnchorNotFound'
AND UrlSrc.IDUrl = Link.IDUrlSrc
AND UrlDest.IDUrl = Link.IDUrlDest
ORDER BY UrlSrc, UrlDest, Link.TagPosition, Link.AttrPosition
Retrieve all the Urls linked to from inside a URL pattern outside
SELECT Source.Url, Dest.Url, Statement
FROM Url Source, Schedule Dest, Link
LEFT JOIN HtmlStatement
ON HtmlStatement.IDUrl = Link.IDUrlSrc
AND HtmlStatement.TagPosition = Link.TagPosition
WHERE Dest.IDUrl = Link.IDUrlDest
AND Source.IDUrl = Link.IDUrlSrc
AND Source.Url like 'pattern%'
AND Dest.Url not like 'pattern%'
Retrieve filenames lengths
SELECT Url, instr(reverse(Url),'/') -1 as FileLength,
instr(reverse(Url),'.') -1 as ExtensionLength,
instr(reverse(Url),'/') - instr(reverse(Url),'.') as NameLength
FROM Url
Retrieve all the documents that don't respect a filename lenght
of 8 chars and an extension bigger than 3 chars (not query string):
SELECT Url
FROM Url
WHERE Url NOT REGEXP '^.*/(([A-Za-z0-9_-]{1,8}\.[A-Za-z]{1,3})?|.*\\?.+)$'
Retrieve all the image reference tags which both found and not found
image alternative files.
SELECT Url.Url, HtmlStatement.Statement, HtmlAttribute.Attribute
FROM HtmlStatement, Url
LEFT JOIN HtmlAttribute ON
HtmlStatement.IDUrl = HtmlAttribute.IDUrl
AND HtmlStatement.TagPosition = HtmlAttribute.TagPosition
AND HtmlAttribute.Attribute = 'ALT'
WHERE Tag='IMG'
AND Url.IDUrl = HtmlStatement.IDUrl
ORDER BY Attribute, Url, HtmlStatement.TagPosition
-------------------------------------------------------------------
|