I'm not sure what the official way to contribute code changes is (I'm assuming you actually want contributions), so I'll try this approach:
I made a few additions to xml_sitemap that should be general enough to be of interest to others as well:
- Added an option to use the robots.txt Disallow directives to filter entries out of the generated sitemap as well (see the small sketch below this list for the matching behaviour). This might not be for everyone, but I actually use my sitemap for human navigation too, and I normally don't want users browsing directly to the kinds of pages I hide from robots (post result pages, validation results, search results, etc.).
- Added an option to explicitly specify additional entries to put in the generated sitemap XML: just a simple comma-separated list of additional URIs. I use this to include things that should be accessed through fancy rewritten URLs (e.g. differently named filtered blog categories). A sketch of the expansion follows after the patch.
Both features are disabled by default, and no config upgrade is required.
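
Before the patch itself, here is a quick stand-alone sketch of the robots.txt matching. The disallow values and URIs below are made up for demonstration, and the function body mirrors xml_sitemap_is_uri_allowed() from the patch: a sitemap candidate is dropped when any Disallow value is a prefix of it, regardless of which User-agent the rule was written for.

Code:
<?php
// Stand-alone illustration of the prefix test used by the new filter.
// Same logic as xml_sitemap_is_uri_allowed() in the patch below; the
// sample disallows and URIs are invented for demonstration.
function is_uri_allowed($disallows, $uri) {
    foreach ($disallows as $disallow) {
        // Drop the URI as soon as any Disallow value is a prefix of it.
        if (strpos($uri, $disallow) === 0) {
            return FALSE;
        }
    }
    return TRUE;
}

// As if robots.txt contained:
//   Disallow: /search/
//   Disallow: /validate.php
$disallows = array('/search/', '/validate.php');

var_dump(is_uri_allowed($disallows, '/search/results?q=foo')); // bool(false)
var_dump(is_uri_allowed($disallows, '/archive/2010/09/'));     // bool(true)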
A patch file against HEAD (file rev 2690) would have been attached, but uploading any useful file type seems to be disallowed, so here is the diff inline:
Code:
--- admin_xml_sitemap.php.2690 2010-09-12 23:39:24.714894200 +0200
+++ admin_xml_sitemap.php.mine 2010-09-12 23:46:27.189933000 +0200
@@ -1,11 +1,11 @@
<?php
// - Extension: XML Sitemap
-// - Version: 1.0.4
+// - Version: 1.0.5
// - Author: PivotX Team
// - Email: admin@pivotx.net
// - Site: http://www.pivotx.net
// - Description: An extension to provide a XML sitemap (for search engines).
-// - Date: 2010-05-12
+// - Date: 2010-09-08
// - Identifier: xml_sitemap
global $xml_sitemap_config;
@@ -15,6 +15,8 @@
'xml_sitemap_include_entries' => 1,
'xml_sitemap_include_pages' => 1,
'xml_sitemap_content_type' => 'text/xml',
+ 'xml_sitemap_filter_as_any_robot' => 0,
+ 'xml_sitemap_additional_uris' => '',
);
@@ -63,6 +65,24 @@
'label' => __('Include pages'),
));
+ $form->add( array(
+ 'type' => 'checkbox',
+ 'name' => 'xml_sitemap_filter_as_any_robot',
+ 'label' => __('Filter as any robot'),
+ 'text' => makeJtip(__('Filter as any robot'), __('Test each sitemap entry against all Disallow rules in robots.txt. User-agent qualifiers are ignored, _every_ Disallow will try to contribute to URI exclusion.')),
+ ));
+
+ $form->add( array(
+ 'type' => 'textarea',
+ 'name' => 'xml_sitemap_additional_uris',
+ 'label' => __('Additional URIs'),
+ 'text' => makeJtip(__('Additional URIs'), __('Additional URIs to include in sitemap. Separate entries with a comma. Entries should be absolute paths (i.e. /myuri/here). Last modified will always be set to \'now()\'')),
+ 'value' => '',
+ 'rows' => 6,
+ 'cols' => 60,
+ 'isrequired' => 0
+ ));
+
/**
* Add the form to our (referenced) $form_html. Make sure you use the same key
* as the first parameter to $PIVOTX['extensions']->getAdminForm
@@ -72,6 +92,61 @@
}
/**
+ * Find suitable disallows from robots.txt. Assumes pivotx is installed
+ * at domain root (i.e. robots.txt resides in site_path)
+ */
+function xml_sitemap_parse_robots_txt() {
+ global $xml_sitemap_config, $PIVOTX;
+ if(!$xml_sitemap_config['xml_sitemap_filter_as_any_robot']) {
+ debug('xml_sitemap: is not using robots.txt');
+ return FALSE;
+ }
+
+ $robot_txt = $PIVOTX['paths']['site_path'] . 'robots.txt';
+ debug('xml_sitemap: is looking for ' . $robot_txt);
+
+ $entries = @file_get_contents($robot_txt);
+ if($entries) {
+ preg_match_all('/Disallow:\s*(.*)/', $entries, $disallows, PREG_PATTERN_ORDER);
+ return $disallows[1];
+ }
+
+ return FALSE;
+}
+
+/**
+ * Test for uri exclusion by robot disallowed settings.
+ */
+function xml_sitemap_is_uri_allowed($disallows, $uri) {
+ if($disallows) {
+ foreach($disallows as $disallow) {
+ if(strpos($uri, $disallow) === 0)
+ return FALSE;
+ }
+ }
+ return TRUE;
+}
+
+/**
+ * Add explicitly specified URIs to sitemap.
+ */
+function xml_sitemap_create_additional($template, &$items) {
+ global $xml_sitemap_config, $PIVOTX;
+
+ $uris = $xml_sitemap_config['xml_sitemap_additional_uris'];
+ if($uris) {
+ $uris = explode(',', $uris);
+ foreach($uris as $uri) {
+ $item = $template;
+ $item = str_replace('%loc%', $uri, $item);
+ $item = str_replace('%lastmod%', formatDate('', '%year%-%month%-%day%'), $item);
+ $items[] = $item;
+ debug('xml_sitemap: added additional URI ' . $uri);
+ }
+ }
+}
+
+/**
* Generates the XML sitemap.
*/
function xml_sitemapHook(&$params) {
@@ -106,18 +181,20 @@
EOM;
$configdata = $PIVOTX['config']->getConfigArray();
- foreach ($xml_sitemap_config as $key => $value) {
+ foreach ($xml_sitemap_config as $key => &$value) {
if (isset($configdata[$key])) {
- $$key = $configdata[$key];
- } else {
- $$key = $value;
- }
+ $value = $configdata[$key];
+ }
+ $$key = $value;
}
$frontpages = array();
$items = array();
$links = array();
+ $disallows = xml_sitemap_parse_robots_txt();
+ debug('xml_sitemap: active robot disallows: [' . ($disallows ? implode(', ', $disallows) : 'none').']');
+
// Handle the frontpages (site_root and weblog frontpages)
$frontpage = $output_frontpage;
$frontpage = str_replace('%loc%', $PIVOTX['paths']['site_url'], $frontpage);
@@ -126,8 +203,15 @@
$weblogs = $PIVOTX['weblogs']->getWeblogNames();
foreach ($weblogs as $weblog) {
$frontpage = $output_frontpage;
- $frontpage = str_replace('%loc%', $PIVOTX['weblogs']->get($weblog,'link'), $frontpage);
- $frontpages[] = $frontpage;
+ $link = $PIVOTX['weblogs']->get($weblog,'link');
+ //debug('xml_sitemap: frontpage link is ' . $link);
+ if(xml_sitemap_is_uri_allowed($disallows, $link)) {
+ $frontpage = str_replace('%loc%', $link, $frontpage);
+ $frontpages[] = $frontpage;
+ }
+ else {
+ debug('xml_sitemap: frontpage ' . $link . ' was disallowed by robots predicate');
+ }
}
}
@@ -147,16 +231,24 @@
'full' => false, 'status'=>'publish'));
$offset += $batch_size;
foreach ($entries as $entry) {
- if (isset($links[$entry['link']])) {
- debug("Duplicate link found for entry " . $entry['uid'] . " and " . $links[$entry['link']]);
+ $link = $entry['link'];
+ //debug('xml_sitemap: entry link is ' . $link);
+ if (isset($links[$link])) {
+ debug("Duplicate link found for entry " . $entry['uid'] . " and " . $links[$link]);
continue;
} else {
- $links[$entry['link']] = $entry['uid'];
+ if(xml_sitemap_is_uri_allowed($disallows, $link)) {
+ $links[$link] = $entry['uid'];
+ }
+ else {
+ debug('xml_sitemap: entry ' . $link . ' was disallowed by robots predicate');
+ continue;
+ }
}
$entry = $PIVOTX['db']->read_entry($entry['uid']);
$item = $output_item;
- $item = str_replace('%loc%', $entry['link'], $item);
+ $item = str_replace('%loc%', $link, $item);
$item = str_replace('%lastmod%', formatDate($entry['edit_date'], '%year%-%month%-%day%'), $item);
$items[] = $item;
}
@@ -188,30 +280,38 @@
continue; // skip it!
}
- $page['link'] = makePageLink($page['uri'], $page['title'], $page['uid'], $page['date']);
+ $link = makePageLink($page['uri'], $page['title'], $page['uid'], $page['date']);
+ //debug('xml_sitemap: page link is ' . $link);
- if (isset($links[$page['link']])) {
- debug("Duplicate link found for page " . $page['uid'] . " and entry/page " . $links[$page['link']]);
+ if (isset($links[$link])) {
+ debug("Duplicate link found for page " . $page['uid'] . " and entry/page " . $links[$link]);
continue;
} else {
- $links[$page['link']] = $page['uid'];
+ if(xml_sitemap_is_uri_allowed($disallows, $link)) {
+ $links[$link] = $page['uid'];
+ }
+ else {
+ debug('xml_sitemap: page ' . $link . ' was disallowed by robots predicate');
+ continue;
+ }
}
$page = $PIVOTX['pages']->getPageByUri($page['uri']);
$item = $output_item;
- $item = str_replace('%loc%', $page['link'], $item);
+ $item = str_replace('%loc%', $link, $item);
$item = str_replace('%lastmod%', formatDate($page['edit_date'], '%year%-%month%-%day%'), $item);
$items[] = $item;
-
}
}
+ // Add explicit URIs
+ xml_sitemap_create_additional($output_item, $items);
+
// Output the Sitemap file as XML
header("content-type: $xml_sitemap_content_type; charset=utf-8");
$output = str_replace('%frontpages%', implode("\n",$frontpages), $output);
echo str_replace('%items%', implode("\n",$items), $output);
die();
-
}
?>
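
Two closing notes. First, the config-loading hunk now iterates by reference, so the saved settings get written back into $xml_sitemap_config itself; the new helper functions read that global array, and without this change they would only ever see the defaults (0 and '') and neither feature would activate.

Second, a minimal sketch of how the 'Additional URIs' value is expanded. The template string below is invented for illustration; the extension reuses its own $output_item with the same %loc% / %lastmod% placeholders, and uses PivotX's formatDate() where date('Y-m-d') stands in here.

Code:
<?php
// Minimal sketch of the 'Additional URIs' expansion; $template is invented,
// the real code reuses the extension's $output_item item template.
$template = "<url>\n  <loc>%loc%</loc>\n  <lastmod>%lastmod%</lastmod>\n</url>";

// What you would type into the new textarea:
$config_value = '/category/linux,/category/photography';

$items = array();
foreach (explode(',', $config_value) as $uri) {
    $item = str_replace('%loc%', $uri, $template);
    // Additional URIs are always stamped with the current date.
    $item = str_replace('%lastmod%', date('Y-m-d'), $item);
    $items[] = $item;
}

echo implode("\n", $items), "\n";

Note that the exploded values are used verbatim (no trimming), so don't put spaces after the commas in the config value.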