My favorites | Sign in
Project Home Downloads Wiki Issues Source
Checkout   Browse   Changes    
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
<?php
/**
* Schtrack!
*
* This is the application itself ;)
*
* @author Kaloyan K. Tsvetkov <kaloyan@kaloyan.info>
* @license http://opensource.org/licenses/gpl-license.php GNU Public License
* @link http://code.google.com/p/schtrack2/
*/

///////////////////////////////////////////////////////////////////////////////

/**
* {@link Snoopy} is used to download the files of the mirrored website
* @see Snoopy
*/
include __SCHTRACK__ . './schtrack/class.snoopy.php';

/**
* Schtrack
*
* This class stores the main functionality for this project: it maps the
* incoming request to a URL on the mirrored ("source") website, downloads
* that URL with {@link Snoopy} when there is no fresh cached copy, stores
* the (slightly rewritten) result on disk and finally prints it.
*
* Configuration keys read by this class:
*  - `source`   host (and optional path prefix) of the mirrored website;
*               it has no default and must be supplied by the caller
*  - `self`     URL path under which this application is installed, with
*               leading and trailing slash (e.g. `/mirror/`)
*  - `save_dir` folder where the cached copies are stored
*  - `ttl`      lifetime of a cached copy, in seconds
*/
class schtrack {

    /**
    * Configuration Settings
    *
    * These are the default values for the advanced
    * settings, so if omitted from the configuration
    * script, then this set of values will be used instead.
    *
    * @var array
    */
    public $conf = array(
        'save_dir' => './site/',
        'ttl' => 2592000, /* 30 days */
        'self' => '/', /* application installed at the web root */
    );

    /**
    * An instance of {@link Snoopy}, used for downloading the files
    * @var Snoopy
    */
    public $sn;

    // -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

    /**
    * Constructor
    *
    * Sets the configuration settings and initiates a {@link Snoopy} object.
    * Values passed in $conf win over the defaults from {@link $conf}.
    *
    * @param array $conf
    */
    public function __construct($conf = array()) {
        $this->conf = (array) $conf + $this->conf;
        $this->sn = new Snoopy;
    }

    /**
    * Legacy (PHP 4 style) constructor
    *
    * Kept so existing code that calls `schtrack()` explicitly keeps
    * working; PHP 8 no longer treats a method named after the class as a
    * constructor, hence the real work lives in {@link __construct()}.
    *
    * @param array $conf
    */
    public function schtrack($conf = array()) {
        $this->__construct($conf);
    }

    /**
    * Executes a download of the required URI from the mirrored website
    *
    * Prints the response body and terminates the script; it never returns.
    */
    public function run() {

        // figure out what's the required URI by stripping the path to
        // this application itself; the delimiter is passed to preg_quote()
        // so that a `~` inside the path (e.g. `/~user/`) stays literal
        //
        $url = '/' . preg_replace(
            '~^' . preg_quote($this->conf['self'], '~') . '~',
            '', $_SERVER['REQUEST_URI']
        );

        // make sure NOT to download robots.txt! This is checked first, so
        // that serving the bundled file never touches the cache folder
        //
        if (false !== strstr($url, 'robots.txt')) {
            readfile(__SCHTRACK__ . './schtrack/robots.txt');
            exit;
        }

        // get the path where to store the
        // cached copy of the downloaded file
        //
        $path = $this->filename($url);

        // create the folder for the site, which is mirrored; recursive,
        // because `save_dir` itself might not exist yet either
        //
        $save_dir = dirname($path);
        if (!is_dir($save_dir)) {
            mkdir($save_dir, 0777, true);
        }

        // a new copy of a file must be fetched
        // if it is either not downloaded yet,
        // or if the cached copy has expired
        //
        $fetch = !file_exists($path)
            || filemtime($path) + $this->conf['ttl'] < time();

        $target = 'http://'
            . $this->conf['source']
            . $url;

        // fetch a new copy of the file, if required; when the download
        // fails an existing (stale) copy is still served further below
        //
        if ($fetch && $this->sn->fetch($target)) {

            $content = $this->cook($target);

            @unlink($path);
            if (false === file_put_contents($path, $content)) {

                // the cache is not writable: serve the fresh
                // copy directly instead of failing the request
                //
                echo $content;
                exit;
            }
        }

        // nothing cached and nothing downloaded: report it, instead
        // of letting readfile() choke on a missing file
        //
        if (!is_file($path)) {
            if (!headers_sent()) {
                header('HTTP/1.0 502 Bad Gateway');
            }
            echo 'Schtrack! was not able to download '
                . htmlspecialchars($target, ENT_QUOTES);
            exit;
        }

        readfile($path);
        exit;
    }

    /**
    * Composes the filename for the cached
    * copy of a file from the mirrored website
    *
    * The folder part of the URL path is url-encoded (slashes included), so
    * all cached files of one site live flat inside `save_dir/source/`. A
    * query string is appended url-encoded, followed by the extension again.
    *
    * @param string $url path (plus optional query string) of the request
    * @return string
    */
    public function filename($url) {

        // parse_url() returns false for seriously malformed
        // URLs and omits `path` when it only sees a host
        //
        $u = parse_url($url);
        if (!is_array($u)) {
            $u = array();
        }
        $u += array(
            'path' => '/',
            'query' => null,
        );

        $i = pathinfo($u['path']) + array(
            'dirname' => '',
            'basename' => '',
            'extension' => 'html',
        );
        $i['dirname'] = ltrim($i['dirname'], '/\\');

        // compare against the empty string instead of relying on
        // truthiness, otherwise a literal "0" is treated as missing
        //
        $query = '';
        if (null !== $u['query'] && '' !== $u['query']) {

            // make sure to encode the question mark and the query
            // string, and to append the extension again after it
            //
            $query = '%3F' . rawurlencode($u['query'])
                . '.' . $i['extension'];
        }

        $folder = ('' === $i['dirname'])
            ? ''
            : rawurlencode($i['dirname'] . '/');

        $file = ('' === $i['basename'])
            ? 'index.html'
            : $i['basename'];

        return $this->conf['save_dir']
            . '/' . $this->conf['source'] . '/'
            . $folder
            . $file
            . $query;
    }

    /**
    * `Cook` fetched data: do a little `cgi-bin` trick, fix the absolute
    * paths, and for HTML pages change the base-href and append a foot note
    *
    * Reads the body and the headers of the last download from {@link $sn}.
    *
    * @param string $url the URL that was downloaded (shown in the foot note)
    * @return string
    */
    public function cook($url) {

        $content = $this->sn->results;

        $content = str_replace('cgi-bin', ' cgi-bin', $content);
        // ^
        // stupid hack to get around the
        // stupid disadvantage of shared
        // hosting, where you can not
        // disable the `cgi-bin` ScriptAlias

        // anything that is not some sort of text is left alone
        //
        $_h = join('', (array) $this->sn->headers);
        if (false === stristr($_h, 'Content-Type: text')) {
            return $content;
        }

        // address of this mirror, without a trailing slash:
        // `http://host` for `/`, `http://host/mirror` for `/mirror/`
        //
        $mirror = 'http://'
            . $_SERVER['HTTP_HOST']
            . rtrim('/' . trim($this->conf['self'], '/\\'), '/');

        // fix the absolute paths; this is done before the base href is
        // inserted, so the inserted mirror address is never rewritten
        //
        $content = str_ireplace(
            'http://' . $this->conf['source'],
            $mirror,
            $content
        );

        // the base href and the foot note are HTML markup, so they are
        // only added to HTML pages (they would corrupt CSS or scripts)
        //
        if (false !== stristr($_h, 'Content-Type: text/html')) {

            // .. change the base href, and ...
            //
            $content = str_ireplace(
                '<body',
                '<base href="'
                    . htmlspecialchars($mirror . '/', ENT_QUOTES)
                    . '" /><body',
                $content
            );

            // ... print the footer ! The URL originates from the
            // request, so it is escaped before it is put in the markup
            //
            $_url = htmlspecialchars(
                preg_replace('~/[^/]+cgi-bin/~Uis', '/cgi-bin/', $url),
                ENT_QUOTES
            );
            $content .= '<div style="margin: 4px; padding: 2px 4px;'
                . ' border: solid 1px #b0b000; background:lightyellow;'
                . ' font-family: arial, tahoma, verdana;'
                . ' position: absolute; top: 0px; left: 0px;'
                . ' display: block; font-size: 11px; cursor:default;">'
                . 'Downloaded by <strong><a style="text-transform:none;"'
                . ' href="http://code.google.com/p/schtrack2/">Schtrack!</a></strong>'
                . ' [' . date('r') . '] &rarr; <a href="' . $_url . '"'
                . ' target="_blank" style="text-transform:none;"><code>'
                . $_url . '</code></a></div>';
        }

        return $content;
    }

    //--end-of-class--
}

Change log

r2 by kaloyan on Jan 26, 2009   Diff
[No log message]
Go to: 
Project members, sign in to write a code review

Older revisions

All revisions of this file

File info

Size: 5466 bytes, 231 lines

File properties

svn:mime-type
text/x-php
svn:keywords
Id Rev Date
Powered by Google Project Hosting