File: urlgrab5.php

package info (click to toggle)
php5 5.2.0-8%2Betch1
  • links: PTS
  • area: main
  • in suites: etch-m68k
  • size: 58,836 kB
  • ctags: 45,575
  • sloc: ansic: 535,107; sh: 17,819; php: 11,336; cpp: 4,289; xml: 3,861; yacc: 2,446; lex: 2,174; makefile: 1,150; tcl: 1,128; awk: 693; perl: 71; sql: 22; pascal: 15
file content (39 lines) | stat: -rw-r--r-- 820 bytes parent folder | download | duplicates (10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
<?php
    /*
     * urlgrab5.php
     *
     * A simple command-line utility to extract all of the URLs contained
     * within <A HREF> tags from a document.
     *
     * NOTE: Only works with tidy for PHP 5; for tidy on PHP 4.3.x, see urlgrab.php.
     *
     * By: John Coggeshall <john@php.net>
     *
     * Usage: php urlgrab5.php <file>
     *
     */
    /**
     * Recursively collect the href values of every <A> element in a tidy tree.
     *
     * Anchors that carry no href attribute (for example <a name="top">) are
     * skipped, so the result holds only actual link targets and no NULL
     * placeholders.
     *
     * @param tidyNode   $node Root of the (sub)tree to scan.
     * @param array|null $urls Accumulator, filled by reference; any value that
     *                         is not an array is replaced with an empty array.
     *
     * @return array The href values collected, in visiting order (a node is
     *               examined before its children).
     */
    function dump_nodes(tidyNode $node, &$urls = NULL) {

        // Start a fresh list unless the caller (or a parent call) supplied one.
        if (!is_array($urls)) {
            $urls = array();
        }

        // Only record anchors that really have an href; reading a missing
        // key would raise an undefined-index notice and append NULL.
        if (isset($node->id) && $node->id === TIDY_TAG_A
            && isset($node->attribute['href'])) {
            $urls[] = $node->attribute['href'];
        }

        if ($node->hasChildren()) {
            // Children append into the same array through the reference.
            foreach ($node->child as $c) {
                dump_nodes($c, $urls);
            }
        }

        return $urls;
    }

    // Entry point: parse the file named on the command line and print every
    // link target found in it.
    if (!isset($_SERVER['argv'][1])) {
        fwrite(STDERR, "Usage: php urlgrab5.php <file>\n");
        exit(1);
    }

    $a = tidy_parse_file($_SERVER['argv'][1]);
    if (!$a) {
        // tidy_parse_file() returns false when the document cannot be
        // loaded; stop here instead of calling a method on a non-object.
        fwrite(STDERR, "Unable to parse file: " . $_SERVER['argv'][1] . "\n");
        exit(1);
    }

    // Run tidy's clean-and-repair pass before walking the tree.
    $a->cleanRepair();
    print_r(dump_nodes($a->html()));
?>