File: LinkGrep.cs

package info (click to toggle)
mono 6.8.0.105%2Bdfsg-3.3
links: PTS, VCS
area: main
in suites: bookworm
size: 1,284,512 kB
sloc: cs: 11,172,132; xml: 2,850,069; ansic: 671,653; cpp: 122,091; perl: 59,366; javascript: 30,841; asm: 22,168; makefile: 20,093; sh: 15,020; python: 4,827; pascal: 925; sql: 859; sed: 16; php: 1
file content (205 lines) | stat: -rw-r--r-- 8,307 bytes
parent folder | download | duplicates (7)
//------------------------------------------------------------------------------
// <copyright file="LinkGrep.cs" company="Microsoft">
//     Copyright (c) Microsoft Corporation.  All rights reserved.
// </copyright>                                                                
//------------------------------------------------------------------------------

namespace System.Web.Services.Discovery {

    using System;
    using System.IO;
    using System.Text.RegularExpressions;
    using System.Web.Services.Protocols;
    using System.Globalization;
    
    internal class LinkGrep {
        // Private constructor: every member of this class is static, so no instance is
        // needed and none can be created from outside the class.
        private LinkGrep() { }

        /// <summary>
        /// Reads every remaining character from <paramref name="input"/> and returns them
        /// as a single string.
        /// </summary>
        /// <param name="input">Reader to drain; it is left positioned at its end.</param>
        /// <returns>The text that was read, or an empty string when nothing was left.</returns>
        private static string ReadEntireStream(TextReader input) {
            // TextReader.ReadToEnd performs the read-until-exhausted loop itself, so there is
            // no need to maintain (and repeatedly double) a private character buffer here.
            return input.ReadToEnd();
        }

        /// <summary>
        /// Scans the text of a document for the first link tag that carries rel="alternate",
        /// an href, and a type accepted by ContentType.MatchesBase for ContentType.TextXml,
        /// and returns that tag's href value.
        /// </summary>
        /// <param name="stream">Stream holding the document; it is read to its end.</param>
        /// <returns>
        /// The href of the first qualifying link tag, or null when the scan stops first:
        /// at a body tag, at the end of the text, or at a position where none of the
        /// scanner's patterns match.
        /// </returns>
        internal static string SearchForLink(Stream stream) {
            // The reader is deliberately not disposed: disposing a StreamReader created this
            // way also closes the stream it wraps, and the stream was handed in by the caller.
            string text = ReadEntireStream(new StreamReader(stream));
            int textpos = 0;

            // Skip a leading <!doctype ...> directive if the text starts with one.
            Match match = doctypeDirectiveRegex.Match(text, textpos);
            if (match.Success) {
                textpos += match.Length;
            }

            for (;;) {
                // Records whether any pattern matched during this pass of the loop.
                bool oneMatch = false;

                // 1: skip the text in front of the next tag. Try whitespace that runs up to
                // a '<' (or the end of the input) first, then any run of non-'<' characters.
                match = whitespaceRegex.Match(text, textpos);
                if (!match.Success) {
                    match = textRegex.Match(text, textpos);
                }
                if (match.Success) {
                    oneMatch = true;
                }

                // A failed Match reports Length 0, so textpos only moves on a real match.
                textpos += match.Length;
                if (textpos == text.Length) {
                    break;
                }

                // 2: handle constructs that start with '<': a tag, an end tag, or a comment.
                match = tagRegex.Match(text, textpos);
                if (match.Success) {
                    oneMatch = true;
                    string tag = match.Groups["tagname"].Value;

                    if (String.Equals(tag, "link", StringComparison.OrdinalIgnoreCase)) {
                        string href = FindDiscoveryHref(match);
                        if (href != null) {
                            // Got a link to a disco file!
                            return href;
                        }
                    }
                    else if (String.Equals(tag, "body", StringComparison.OrdinalIgnoreCase)) {
                        // If body begins, get out, since link tags should only be defined in
                        // the head. The comparison ignores case, like the "link" check above,
                        // so <BODY> stops the scan as well.
                        break;
                    }
                }
                else {
                    match = endtagRegex.Match(text, textpos);
                    if (!match.Success) {
                        match = commentRegex.Match(text, textpos);
                    }
                    if (match.Success) {
                        oneMatch = true;
                    }
                }

                textpos += match.Length;
                if (textpos == text.Length) {
                    break;
                }

                // If we couldn't get one single match, it's probably not HTML, so bail.
                if (!oneMatch) {
                    break;
                }
            }

            return null;
        }

        // Walks the attributes of a matched link tag in document order and returns the most
        // recently seen href as soon as a matching type, rel="alternate", and an href have all
        // been seen; returns null if the attributes run out first.
        private static string FindDiscoveryHref(Match linkTag) {
            // The tag pattern captures one name and one value per attribute, so the two
            // collections line up index by index.
            CaptureCollection attrnames = linkTag.Groups["attrname"].Captures;
            CaptureCollection attrvalues = linkTag.Groups["attrval"].Captures;

            bool rightType = false;
            bool rightRel = false;
            string href = null;

            for (int i = 0; i < attrnames.Count; i++) {
                string attrName = attrnames[i].Value;
                string attrValue = attrvalues[i].Value;

                if (String.Equals(attrName, "type", StringComparison.OrdinalIgnoreCase) &&
                    ContentType.MatchesBase(attrValue, ContentType.TextXml)) {
                    rightType = true;
                }
                else if (String.Equals(attrName, "rel", StringComparison.OrdinalIgnoreCase) &&
                    String.Equals(attrValue, "alternate", StringComparison.OrdinalIgnoreCase)) {
                    rightRel = true;
                }
                else if (String.Equals(attrName, "href", StringComparison.OrdinalIgnoreCase)) {
                    href = attrValue;
                }

                if (rightType && rightRel && href != null) {
                    return href;
                }
            }

            return null;
        }
        
        // Matches one start tag (optionally self-closing) that begins exactly at the search
        // position (\G). Named groups: "prefix" and "tagname" for the element name, one
        // "attrprefix" / "attrname" / "attrval" capture per attribute, and "empty" for a
        // trailing '/'. Every attribute must be written as name=value (double-quoted,
        // single-quoted, or unquoted), so a tag containing a bare attribute does not match.
        // SearchForLink pairs the "attrname" and "attrval" captures by index, which works
        // because each attribute contributes exactly one capture to each group.
        private readonly static Regex tagRegex = new Regex
        (
            "\\G<" +                                    // leading <
            "(?<prefix>[\\w:.-]+(?=:)|):?" +            // optional prefix:
            "(?<tagname>[\\w.-]+)" +                    // tagname
    
            "(?:\\s+" +                                 // zero or more attributes
                "(?<attrprefix>[\\w:.-]+(?=:)|):?" +    //     optional attrprefix:
                "(?<attrname>[\\w.-]+)" +               //     attrname
                "\\s*=\\s*" +                           //     required equals
                "(?:" +                                 //     quoted value
                    "\"(?<attrval>[^\"]*)\"" +          //          double quoted attrval
                    "|'(?<attrval>[^\']*)'" +           //          single quoted attrval
                    "|(?<attrval>[a-zA-Z0-9\\-._:]+)" + //          attrval with no quotes (SGML-approved chars)
                ")" +                                   //     end quoted value
            ")*" +                                      // end attribute
            
            "\\s*(?<empty>/)?>"                         // optional trailing /, and trailing >
        );
            
        // Matches a <!doctype ...> directive that begins exactly at the search position (\G),
        // ignoring case. After the keyword, the directive may contain any mix of
        // word/whitespace runs and double-quoted strings before the closing '>'.
        //
        // The word/whitespace run is an atomic group. A plain ([\s\w]+) nested inside the
        // outer (...)* lets the engine split one long run in exponentially many ways when no
        // closing '>' follows, which matters because the scanned text comes from a remote
        // document. Taking each run atomically accepts exactly the same strings, since a
        // shortened run can only be followed by another run that re-joins it.
        //
        // NOTE(review): the quoted alternative "".*"" is unchanged and can still backtrack
        // heavily on a line with many '"' characters and no closing '>'; tightening it would
        // change which inputs match, so it is left for a separate decision.
        private readonly static Regex doctypeDirectiveRegex = new Regex
        (
            @"\G<!doctype\b(?:(?>[\s\w]+)|"".*"")*>", 
            RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace
        );

        // Matches one end tag (</prefix:name>) that begins exactly at the search position (\G).
        // The prefix and tag-name character classes accept '.', the same as the start-tag
        // pattern, so an end tag whose name contains a dot (for example </a.b>) is recognized
        // and skipped instead of stopping the scan.
        private readonly static Regex endtagRegex = new Regex
        (
            "\\G</" +                                   // leading </
            "(?<prefix>[\\w:.-]+(?=:)|):?" +            // optional prefix:
            "(?<tagname>[\\w.-]+)\\s*>"                 // tagname, optional whitespace, trailing >
        );
        
        // Matches one comment that begins exactly at the search position (\G): the opening
        // "<!--", then the fewest '-'-terminated chunks that are followed by "->". In effect
        // the match runs through the first "-->" that starts after the opening "<!--".
        private readonly static Regex commentRegex = new Regex
        (
            "\\G<!--" +                                 // leading <!--
            "(?>[^-]*-)+?" +                            // one or more chunks of text ending with -, minimal
            "->"                                        // trailing ->
    
        );
        
        // Matches a run of whitespace that begins exactly at the search position (\G) and
        // extends all the way to the next '<' or to the end of the input. The lookahead does
        // not consume the '<', so the following tag is still available to the tag patterns.
        private readonly static Regex whitespaceRegex = new Regex
        (
            "\\G\\s+" +                                 // at least one char of whitespace
            "(?=<|\\Z)"                                 // ending with either '<' or the end of the string
        );
        
        // Matches the longest run of characters other than '<' that begins exactly at the
        // search position (\G); SearchForLink uses it to skip text between tags.
        private readonly static Regex textRegex = new Regex
        (
            "\\G[^<]+"                                  // at least one char of non-'<', maximal
        );
    }
}