Source for file lucene-defs.php

Documentation is available at lucene-defs.php

  1. <?php
  2. /* ******************************************************************** */
  3. /* CATALYST PHP Source Code */
  4. /* -------------------------------------------------------------------- */
  5. /* This program is free software; you can redistribute it and/or modify */
  6. /* it under the terms of the GNU General Public License as published by */
  7. /* the Free Software Foundation; either version 2 of the License, or */
  8. /* (at your option) any later version. */
  9. /* */
  10. /* This program is distributed in the hope that it will be useful, */
  11. /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
  12. /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
  13. /* GNU General Public License for more details. */
  14. /* */
  15. /* You should have received a copy of the GNU General Public License */
  16. /* along with this program; if not, write to: */
  17. /* The Free Software Foundation, Inc., 59 Temple Place, Suite 330, */
  18. /* Boston, MA 02111-1307 USA */
  19. /* -------------------------------------------------------------------- */
  20. /* */
  21. /* Filename: lucene-defs.php */
  22. /* Author: Paul Waite */
  23. /* Description: NB: This module is a variant of the original lucene */
  24. /* module which processed fields line-by-line. This module */
  25. /* implements the XML interface to Lucene. */
  26. /* */
  27. /* Definitions for interfacing to the LUCENE search */
  28. /* engine system. LUCENE is a system which is optimised */
  29. /* for indexing and searching in a generic way. It is */
  30. /* implemented as a server accessible via a port over TCP. */
  31. /* This module understands the protocol that this server */
  32. /* uses to implement indexing and search queries. */
  33. /* */
  34. /* ******************************************************************** */
  35. /** @package search */
  include_once("search-defs.php");
  /** Stopwatch microtimer */
  include_once("timer-defs.php");
  /** XML classes */
  include_once("xml-defs.php");

  // ----------------------------------------------------------------------
  // NB: every constant below must be created with define(); a bare
  // parenthesised ("NAME", value) list is a parse error in PHP.

  /** Do not wait on socket receive, return immediately */
  define("SOCK_NO_WAIT", 0);
  /** Wait on socket forever (well, 24hrs is that, more or less) */
  define("SOCK_FOREVER", 86400);
  /** Times to retry timed-out socket sends/receives */
  define("SOCK_RETRIES", 3);
  /** Used to indicate that a field should be indexed by Lucene */
  define("INDEXED", true);
  /** Used to indicate that a field should NOT be indexed by Lucene */
  define("NOT_INDEXED", false);
  /** Used to indicate that a field should be stored by Lucene */
  define("STORED", true);
  /** Used to indicate that a field should NOT be stored by Lucene */
  define("NOT_STORED", false);
  /** The name of the field Lucene should assume if none specified */
  define("DEFAULT_FIELD", "Text");
  /** Default type of field: 'Text', 'Date', 'Id' */
  define("DEFAULT_FIELDTYPE", "Text");
  /** Mode of index ID generation is by incrementing integer */
  define("ID_FROM_INC", 0);
  /** Mode of index ID generation is by filename stripped of path and extension */
  define("ID_FROM_NAME", 1);
  /** Mode of index ID generation is by full filename (incl. extension) */
  define("ID_FROM_FILENAME", 2);
  /** Mode of index ID generation is by full path to file */
  define("ID_FROM_PATH", 3);
  /** Indicates index fields come from meta tag extraction */
  define("META_TAG_FIELDS", true);
  71.  
  72. // ----------------------------------------------------------------------
  73. /**
  74. * The lucene connection class
  75. * This class inherits the functionality of the 'search' class since mostly
  76. * that is what we will be connecting to Lucene for. The Indexing and
  77. * Control descendants can just ignore this inherited basic searching
  78. * functionality.
  79. * This class knows how to connect to a Lucene server and send and
  80. * receive messages to/from it. Child classes which need to talk to this
  81. * server to do indexing or querying should inherit this class.
  82. * @package search
  83. */
  class lucene_connection extends search {
    // Public
    /** HOST running the Lucene query server */
    var $host = "";
    /** PORT that the server is listening on */
    var $port = "";
    /** Timeout for send in seconds */
    var $timeoutsecs = 10;

    // Private
    /** Whether Lucene is enabled..
        @access private */
    var $enabled = true;
    /** The message waiting to be sent
        @access private */
    var $message = "";
    /** Raw response content we receive back from the Lucene server
        @access private */
    var $responsebuf = "";
    /** Socket file pointer, or false when no socket is open
        @access private */
    var $sockfp = false;
    /** True if we are connected to socket
        @access private */
    var $connected = false;
    /** An execution timer
        @access private */
    var $timer;
    // .....................................................................
    /** Constructor - Lucene connection
    * Connects straight away when a host is supplied, otherwise the
    * connection is deferred until the first send().
    * @param string $host Hostname or IP of Lucene server
    * @param string $port Port of Lucene server
    * @param integer $timeoutsecs Seconds to timeout the connection
    */
    function lucene_connection($host="", $port="", $timeoutsecs="") {
      debugbr("Lucene connection: using XML interface v1.0");
      if ($host != "") {
        $this->connect($host, $port, $timeoutsecs);
      }
      $this->timer = new microtimer();
    } // lucene_connection
    // .....................................................................
    /**
    * Connect to the Lucene server. Optionally over-ride various settings
    * which were set in the constructor. Normally this method is only
    * called internally, in response to a request to send a message to
    * the Lucene server. Any socket which is still open from an earlier
    * connection is closed first.
    * @access private
    * @param string $host Hostname or IP of Lucene server
    * @param string $port Port of Lucene server
    * @param integer $timeoutsecs Seconds to timeout the connection
    * @return boolean True if we are now connected
    */
    function connect($host="", $port="", $timeoutsecs="") {
      // Override host and port if given..
      if ($host != "") $this->host = $host;
      if ($port != "") $this->port = $port;

      // Utilise the Axyl configuration settings, if available..
      if (class_exists("configuration")) {
        $config = new configuration("sys_control");
        // This controls whether we have Lucene capability or not..
        if ($config->field_exists("Lucene Site Indexing")) {
          $this->enabled = $config->value("Lucene Site Indexing");
        }
        // Only set host & port if they have not been given yet..
        if ($this->host == "") {
          $this->host = $config->value("Lucene Host");
          $this->port = $config->value("Lucene Port");
          debugbr("acquired Axyl config: host=$this->host, port=$this->port");
        }
      }
      // Release any socket left open by a previous connection, so the
      // old handle is not leaked when we overwrite it below. This also
      // resets the connected flag..
      $this->disconnect();

      // Try to open socket if we have a host..
      if ($this->enabled && $this->host != "") {
        $this->sockfp = fsockopen($this->host, (int) $this->port);
        if (!$this->sockfp) {
          $this->log_error("failed to connect to '$this->host:$this->port'");
        }
        else {
          // Flag as connected BEFORE setting the timeout: set_timeout()
          // does nothing while the connection is flagged as closed..
          $this->connected = true;
          if ($timeoutsecs !== "" && $timeoutsecs !== null) {
            $this->timeoutsecs = $timeoutsecs;
          }
          $this->set_timeout($this->timeoutsecs);
          debugbr("lucene_connection: connected to '$this->host:$this->port'");
        }
      }
      // Return result..
      return $this->connected;
    } // connect
    // .....................................................................
    /**
    * Disconnect from the Lucene server. Normally this is used only by
    * internal Lucene server methods. Safe to call when not connected.
    * @access private
    */
    function disconnect() {
      if ($this->connected && is_resource($this->sockfp)) {
        fclose($this->sockfp);
      }
      // Always reset both, so that a later send() reconnects rather
      // than trying to write to a closed socket..
      $this->sockfp = false;
      $this->connected = false;
    } // disconnect
    // .....................................................................
    /**
    * Set the socket timeout. Deals with the special case of setting
    * the socket to non-blocking mode (zero timeout, SOCK_NO_WAIT)..
    * Does nothing when not connected, or when the timeout is passed
    * as a nullstring.
    * @param integer $timeoutsecs Set the timeout in seconds
    */
    function set_timeout($timeoutsecs) {
      // Strict test: a loose compare against "" would also reject the
      // integer zero (SOCK_NO_WAIT) on older PHP versions..
      if ($this->connected && $timeoutsecs !== "" && $timeoutsecs !== null) {
        $this->timeoutsecs = $timeoutsecs;
        $nowait = ($this->timeoutsecs == SOCK_NO_WAIT);
        if (!$nowait) {
          stream_set_timeout($this->sockfp, (int) $this->timeoutsecs);
        }
        stream_set_blocking($this->sockfp, !$nowait);
      }
    } // set_timeout
    // .....................................................................
    /**
    * Sends a message to the Lucene server, and receives the response. We
    * operate on the understanding that every time we send something to
    * Lucene we expect a response. Since this method already calls the
    * receive() method, there is no need to call it from your application.
    * The content to be sent is expected to be already in the class
    * string variable $message. The raw reply is left in $responsebuf,
    * and $response is reset to an empty array before sending.
    * @param integer $timeoutsecs Override for timeout in seconds
    * @return boolean True if the message was sent and a reply received ok
    */
    function send($timeoutsecs="") {
      $send_ok = true;
      $this->response = array();
      if (!$this->connected) {
        $this->connect();
      }
      if ($this->connected) {
        // Check for timeout over-ride..
        if ($timeoutsecs !== "" && $timeoutsecs !== null) {
          $this->timeoutsecs = $timeoutsecs;
        }
        $this->set_timeout($this->timeoutsecs);
        // Send message..
        if ($this->message != "") {
          $this->timer->restart();
          // A single write to a network stream may stop short of the
          // whole string, so keep writing until everything has gone,
          // or until the write fails (false) or stalls (zero bytes)..
          $msglen = strlen($this->message);
          $bytesput = 0;
          while ($bytesput < $msglen) {
            $written = fwrite($this->sockfp, substr($this->message, $bytesput));
            if ($written === false || $written === 0) {
              $bytesput = false;
              break;
            }
            $bytesput += $written;
          }
          $this->timer->stop();
          if (debugging()) {
            $buf = trim(substr(rawurldecode($this->message),0, 5000));
            debugbr("<pre>" . xmldump($buf) . "</pre>", DBG_DUMP);
            debugbr("lucene_connection: send transaction took " . $this->timer->formatted_millisecs() . "mS");
          }
          if ($bytesput !== false) {
            debugbr("lucene_connection: send ok ($bytesput bytes)");
            for ($i=0; $i< SOCK_RETRIES; $i++) {
              $send_ok = $this->receive();
              if ($send_ok) break;
              debugbr("lucene_connection: receive retry #" . ($i + 1));
            }
          }
          else {
            $this->log_error("write to server failed");
            $send_ok = false;
          }
        }
        else {
          $this->log_error("trying to send null content");
          $send_ok = false;
        }
      }
      else {
        $this->log_error("send with no open socket");
        $send_ok = false;
      }
      // Return status..
      return $send_ok;
    } // send
    // .....................................................................
    /**
    * Receive a message from the Lucene server. Reads until end-of-file
    * on the socket, appending everything to $responsebuf. If the socket
    * is in non-blocking mode (SOCK_NO_WAIT) it will return immediately
    * with or without a message. This is a low-level routine which deals
    * with receiving the message over TCP sockets.
    * @return boolean True if the message was received loud and clear
    * @access private
    */
    function receive() {
      $received_ok = true;
      if ($this->connected) {
        $this->timer->restart();
        $this->responsebuf = "";
        while (!feof($this->sockfp)) {
          $buf = fread($this->sockfp, 10000);
          if ($buf === false) {
            $this->log_error("no response from server");
            $received_ok = false;
            break;
          }
          $this->responsebuf .= $buf;
          if ($buf === "") {
            // An empty read without EOF means the read timed out or
            // the socket is non-blocking with nothing waiting. Either
            // way we must stop, or this loop would spin forever..
            $meta = stream_get_meta_data($this->sockfp);
            if (!empty($meta["timed_out"])) {
              $this->log_error("no response from server");
              $received_ok = false;
              break;
            }
            if (empty($meta["blocked"])) {
              break;
            }
          }
        }
        $this->timer->stop();
        if (debugging()) {
          debugbr("<pre>" . xmldump($this->responsebuf) . "</pre>", DBG_DUMP);
          debugbr("lucene_connection: response from server took " . $this->timer->formatted_millisecs() . "mS");
        }
      }
      else {
        $this->log_error("receive with no open socket");
        $received_ok = false;
      }
      // Return status..
      return $received_ok;
    } // receive
    // .....................................................................
    /** Log a message to the syslog and print info to debugger. The
    * message is decorated with the class name and host:port, and the
    * logged copy is prefixed with APP_NAME if that is defined.
    * @param string $err The error message text
    * @access private
    */
    function log_error($err) {
      $prefix = (defined("APP_NAME") ? APP_NAME . ": " : "");
      $err = "Lucene error: " . get_class($this) . ": $this->host:$this->port: $err";
      debugbr($err);
      error_log($prefix . $err, 0);
    } // log_error

  } // lucene_connection class
  309. // ----------------------------------------------------------------------
  310.  
  311. /** The lucene fieldset class. This holds the Lucene fields for a lucene
  312. * message. These fields comprise the list of tags which make up
  313. * a query message or an index message.
  314. * @access private
  315. * @package search
  316. */
  class lucene_fieldset {
    /** The <Field> tag objects of this set, keyed on field name */
    var $xmltags = array();
    // .....................................................................
    /** Constructor - nothing to initialise */
    function lucene_fieldset() { }
    // .....................................................................
    /**
    * Fetch the field object stored under the given name. When no such
    * field has been stored yet a brand new <Field> tag is made, with
    * its name attribute set, and returned instead. A new field is NOT
    * added to the set; use put_field() for that.
    * @param string $fieldname The name of the field to get
    * @return object An xmltag object for the field
    */
    function get_field($fieldname) {
      if (!isset($this->xmltags[$fieldname])) {
        $newtag = new xmltag("Field");
        $newtag->setattribute("name", $fieldname);
        return $newtag;
      }
      return $this->xmltags[$fieldname];
    } // get_field
    // .....................................................................
    /**
    * Store a field object in the set under the given name, replacing
    * anything already stored under that name.
    * @param string $fieldname Unique name of the field in the set
    * @param object $field The field object to store
    */
    function put_field($fieldname, $field) {
      $this->xmltags[$fieldname] = $field;
    } // put_field
    // .....................................................................
    /**
    * Set the type/stored/indexed attributes of the named field, making
    * the field first if it is not in the set, then store it back.
    * @param string $fieldname Name of the field
    * @param string $type Type of this field eg. "Date"
    * @param boolean $stored Whether field value should be stored by Lucene
    * @param boolean $indexed Whether field value should be indexed by Lucene
    */
    function define_field($fieldname, $type, $stored=STORED, $indexed=INDEXED) {
      $storedflag = "false";
      if ($stored) {
        $storedflag = "true";
      }
      $indexedflag = "false";
      if ($indexed) {
        $indexedflag = "true";
      }
      $tag = $this->get_field($fieldname);
      $tag->setattribute("type", $type);
      $tag->setattribute("stored", $storedflag);
      $tag->setattribute("indexed", $indexedflag);
      $this->put_field($fieldname, $tag);
    } // define_field
    // .....................................................................
    /**
    * Assign a value to the named field, making the field first if it
    * is not in the set, then store it back.
    * @param string $fieldname Name of the field
    * @param string $fieldvalue Value to associate with this field
    */
    function add_field($fieldname, $fieldvalue="") {
      $tag = $this->get_field($fieldname);
      $tag->value = $fieldvalue;
      $this->put_field($fieldname, $tag);
    } // add_field
    // .....................................................................
    /** Empty the fieldset of all its fields */
    function clear() {
      $this->xmltags = array();
    } // clear
    // .....................................................................
    /**
    * Render every field in the set, in stored order.
    * @return string The concatenated output of each field's render()
    */
    function render() {
      $rendered = array();
      foreach ($this->xmltags as $tag) {
        $rendered[] = $tag->render();
      }
      return implode("", $rendered);
    } // render

  } // lucene_fieldset class
  395. // ----------------------------------------------------------------------
  396.  
  397. /**
  398. * The lucene msg class. This is a raw class which holds the basic
  399. * message fields and data and knows how to build them into a full
  400. * message for sending to the lucene server.
  401. * @package search
  402. */
  class lucene_msg extends lucene_connection {
    // Public
    /** Type/name of this message */
    var $type = "";

    // Private
    /** Array containing XML tags. The Application tag lives under the
        string key "Application"; all other tags are simply appended.
        @access private */
    var $xmltags = array();
    /** Object containing lucene fields
        @access private */
    var $fieldset;
    /** True if message has been built
        @access private */
    var $built = false;
    /** Error message if any error occurred
        @access private */
    var $error_msg = "";
    // .....................................................................
    /** Constructor
    * Notes: The application is either specified in the formal parameters or it
    * can be determined for an Axyl application by using the APP_PREFIX which
    * is unique to the application. This is the recommended option. Other
    * developers have, however, also used the configvalue 'Lucene Application'
    * for some reason, so this is still supported here. If none of these
    * methods results in a valid identifier, 'default' is used.
    * @param string $type Type of message this is, eg; QUERY, INDEX..
    * @param string $application The application name. Sets default Lucene config.
    * @param string $host Hostname or IP of Lucene server
    * @param string $port Port of Lucene server
    */
    function lucene_msg($type="", $application="?", $host="", $port="") {
      $this->lucene_connection($host, $port);
      $this->type = $type;
      $this->fieldset = new lucene_fieldset();
      // We must have an application..
      if ($application == "?") {
        if (class_exists("configuration")) {
          $config = new configuration("sys_control");
          $application = $config->value("Lucene Application");
        }
        // Axyl configuration value may not be defined and
        // the APP_PREFIX will be used in this case..
        if ($application == "" || $application == "?") {
          if ( defined("APP_PREFIX")) {
            $application = APP_PREFIX;
          }
          else {
            // The default case for standalone apps..
            $application = "default";
          }
        }
      }
      // Set the application..
      $this->set_application($application);
    } // lucene_msg
    // .....................................................................
    /**
    * Add a new XML tag object to this Lucene message. The message is
    * flagged as needing a re-build.
    * @param object $tag The xmltag object to add to our lucene msg
    */
    function add_xmltag($tag) {
      $this->xmltags[] = $tag;
      $this->built = false;
    } // add_xmltag
    // .....................................................................
    /**
    * Specify the application. The application is the name of a configuration
    * set which has been specified either by a control message, or by using
    * configuration files on the server. A given configuration set identified
    * by an application name can have specific fields already defined, such
    * as Sort: or Domain: etc.
    * Notes: The 'Application' header can only appear once in the message,
    * so calling this again replaces the application set previously (the
    * constructor always sets one) rather than adding a second header.
    * @param string $application The application name to set.
    */
    function set_application($application) {
      // Stored under a fixed key, so that a repeat call overwrites the
      // earlier tag in place. build() iterates the tags by value only,
      // so the string key has no effect on the rendered message..
      $this->xmltags["Application"] = new xmltag("Application", $application);
      $this->built = false;
    } // set_application
    // .....................................................................
    /**
    * Specify a domain. A domain is an identifier which groups indexed
    * objects internally to Lucene. This allows searches on multiple
    * archives of documents in a single Lucene installation.
    * Notes: There may be zero or more domain headers in the message. If it
    * does not appear, then any domain header defined for the application
    * will be applied on its own. Otherwise any definitions added by this
    * method are OR'd with any specified in the application config.
    * NB: If no domains are specified anywhere, any searching will be done
    * across all domains (which would probably yield very confusing return
    * data!).
    * @param string $domain The domain to set.
    */
    function set_domain($domain) {
      $this->add_xmltag( new xmltag("Domain", $domain) );
    } // set_domain
    // .....................................................................
    /** Add a field to the fieldset. The message is flagged as needing
    * a re-build.
    * @param string $fieldname Name of the field
    * @param string $fieldvalue Value to associate with this field
    */
    function add_field($fieldname, $fieldvalue="") {
      $this->fieldset->add_field($fieldname, $fieldvalue);
      $this->built = false;
    } // add_field
    // .....................................................................
    /** Clear the fieldset and the built message text. The message type
    * and the XML tags already added are left alone. */
    function clear() {
      $this->fieldset->clear();
      $this->message = "";
      $this->built = false;
    } // clear
    // .....................................................................
    /**
    * Builds the message according to the message type. This method
    * may be over-ridden in children inheriting this class. Nothing is
    * built if the message has no type. The result is an XML document
    * placed in $message, with the type as the root element, holding
    * each XML tag and then a <Fields> element if any fields exist.
    * @return boolean True if the message is built and ready to send
    * @access private
    */
    function build() {
      if (!$this->built) {
        if ($this->type != "") {
          $xml = new xmltag($this->type);
          // XML TAGS
          foreach ($this->xmltags as $tag) {
            $xml->childtag($tag);
          }
          // FIELDS
          if (count($this->fieldset->xmltags) > 0) {
            $fields = new xmltag("Fields");
            foreach ($this->fieldset->xmltags as $field) {
              $fields->childtag($field);
            }
            $xml->childtag($fields);
          }
          $this->message = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $xml->render();
          $this->built = true;
        }
      }
      return $this->built;
    } // build
    // .....................................................................
    /**
    * Builds and sends the current message to Lucene, then closes the
    * socket.
    * @param integer $timeoutsecs Override for timeout in seconds
    * @return boolean True if the message was built, sent and answered ok
    */
    function send($timeoutsecs="") {
      $sent_ok = false;
      if ($this->build()) {
        // Low-level socket send-receive transaction. Keep the status
        // so that our caller can tell whether it worked..
        $sent_ok = parent::send($timeoutsecs);
        // Once a msg is sent, socket can be closed..
        $this->disconnect();
      }
      return $sent_ok;
    } // send

  } // lucene_msg class
  560. // ----------------------------------------------------------------------
  561.  
  562. /**
  563. * The lucene message class. This class extends its parent class
  564. * lucene_msg and adds some higher level methods for adding groups of
  565. * fields to the message.
  566. * @package search
  567. */
  568. class lucene_message extends lucene_msg {
  569. /** Response object which will parse XML content
  570. @access private */
  571. var $response;
  572. // .....................................................................
  573. /** Constructor
  574. * This is a more complex class which builds on the basic lucene_msg
  575. * class to provide some higher level methods for adding fields in
  576. * specific ways to support CONTROL, QUERY and INDEX message types.
  577. * @param string $type Type of message this is, eg; QUERY, INDEX..
  578. * @param string $application The application name. Sets default Lucene config.
  579. * @param string $host Hostname or IP of Lucene server
  580. * @param string $port Port of Lucene server
  581. */
  function lucene_message($type="", $application="?", $host="", $port="") {
    // Nothing extra to set up here: hand all the arguments straight
    // on to the lucene_msg constructor..
    $this->lucene_msg(
      $type,
      $application,
      $host,
      $port
    );
  } // lucene_message
  585. // .....................................................................
  586. /**
  587. * Strip field type specifiers out of field strings. A field string with
  588. * a type specifier in it is of the form: 'Foo:Date', where the field
  589. * name is 'Foo' and the field type is 'Date'. Possible field types are
  590. * 'Id', 'Text' (the default), and 'Date'.
  591. * Note that sort field specification is a special case, where the syntax
  592. * can be 'Foo:Date:Desc' or 'Foo:Desc' indicating the sort on the given
  593. * field should be done in descending order.
  594. * At present you would only use this facility with a 'Date' field, and
  595. * everything else would then default to 'Text'. [The 'Id' type being a
  596. * special one]
  597. * We return the field stripped of any type, and if a type was present
  598. * we issue the define_field() directive to define it. A field so-defined
  599. * will always be both stored by Lucene and indexed.
  600. * @param string $field Field in 'Foo:Date' format, or just 'Foo' for default type
  601. * @return string The fieldname stripped of any type specifier
  602. * @access private
  603. */
  function strip_field_type($field) {
    // Trim whether or not a type specifier is present, so that the
    // entries of a list such as "Title, Author:Text" are all returned
    // without stray whitespace (previously only typed ones were)..
    $field = trim((string) $field);
    $retfieldname = $field;
    if (strpos($field, ":") !== false) {
      // Extract field specifier parts: name[:spec1[:spec2]]. Missing
      // parts become nullstrings rather than nulls..
      $bits = explode(":", $field);
      $fieldname = trim($bits[0]);
      $f1 = isset($bits[1]) ? trim($bits[1]) : "";
      $f2 = isset($bits[2]) ? trim($bits[2]) : "";
      $retfieldname = $fieldname;
      // Check for a sort field with DESC specifier, which may be in
      // either position, eg. 'Foo:Desc' or 'Foo:Date:Desc'..
      if ($f1 === "Desc" || $f2 === "Desc") {
        $retfieldname .= ":Desc";
      }
      // Check for valid field type specifier, which is only looked
      // for in the first position after the name..
      if ($f1 === "Date" || $f1 === "Text" || $f1 === "Id") {
        // Define field by name..
        $this->define_field($fieldname, $f1);
      }
    }
    // Return fieldname plus any sort spec..
    return $retfieldname;
  } // strip_field_type
  627. // .....................................................................
  628. /**
  629. * Define a field. We supply the name of the field, its type (Text, Date
  630. * or Id), and whether it should be stored by Lucene for later retrieval
  631. * in queries. For example you would not store the raw document/content as
  632. * this is usually stored elsewhere.
  633. * We also cater for fields which might not need to be indexed. These would
  634. * be fields of data you just want to return with the document, if found in
  635. * a query, but not search on. An example might be a field containing the
  636. * path to the physical document on disk. For these fields you would then
  637. * specify NOT_INDEXED for the $indexed parameter. These fields MUST be
  638. * stored, so we make the rule: if the field is NOT_INDEXED then it must
  639. * be STORED (this will be forced).
  640. * In the normal course of events, fields will be defined to be both stored
  641. * and indexed. The exception is the special "Text" field associated with
  642. * an item "Body", which is indexed, but never stored.
  643. * This method adds the field settings directly via the add_field() method.
  644. * @see add_field()
  645. * @param string $fieldname Name of the field to index
  646. * @param string $type Type of field data: Text, Date or Id.
  647. * @param boolean $stored If true then Lucene will store the content itself
  648. * @param boolean $indexed If true then Lucene will index the field content
  649. */
  function define_field($fieldname, $type, $stored=STORED, $indexed=INDEXED) {
    // A field which is not indexed is only of use if Lucene keeps its
    // value, so storage is forced on for such fields..
    if (NOT_INDEXED == $indexed) {
      $stored = STORED;
    }
    // The fieldset object holds the actual definition..
    $this->fieldset->define_field(
      $fieldname,
      $type,
      $stored,
      $indexed
    );
  } // define_field
  655. // .....................................................................
  656. /**
  657. * Specify the fields you want returned from Lucene.
  658. * Fields should be in a comma-separated list of field names. Each field
  659. * name can have the field type included in the form 'Foo:Date', where
  660. * 'Date' is the type in this instance. In fact, since 'Text' is the
  661. * default field type, 'Date' is probably the only one you need to use
  662. * as the current implementation stands.
  663. * This method adds the field setting directly via the add_field() method.
  664. * @see add_field
  665. * @param mixed $fields Comma-delimited fieldname list, or array of fields
  666. */
  function set_returnfields($fields) {
    // Accept a ready-made array, or split a comma-delimited string..
    $fieldspecs = is_array($fields) ? $fields : explode(",", $fields);
    // Strip each spec down to its name; any types found are defined
    // as a side-effect of strip_field_type()..
    $names = array();
    foreach ($fieldspecs as $spec) {
      $names[] = $this->strip_field_type($spec);
    }
    // The Return tag takes the names as a space-delimited list..
    $returntag = new xmltag("Return", implode(" ", $names));
    $this->add_xmltag($returntag);
  } // set_returnfields
  681. // .....................................................................
  682. /**
  683. * Specify query limit field. This sets the maximum number of results
  684. * that Lucene should return.
  685. * @param integer $limit Maximum number of results (hits) to return
  686. */
  function set_limit($limit) {
    // The limit travels as the content of a <Limit> tag..
    $limittag = new xmltag("Limit", $limit);
    $this->add_xmltag($limittag);
  } // set_limit
  690. // .....................................................................
  691. /**
  692. * Specify query offset field 'First'. This sets the offset for the
  693. * returned results. For example, if this was set to 3, and Lucene
  694. * found 20 hits, then results would be sent back from the 3rd hit
  695. * onwards.
  696. * @param integer $first Offset in result set to start from
  697. */
  function set_first($first) {
    // The offset travels as the content of a <First> tag..
    $firsttag = new xmltag("First", $first);
    $this->add_xmltag($firsttag);
  } // set_first
  701. // .....................................................................
  702. /**
  703. * Specify the fields you want query results to be ordered by.
  704. * Fields should be in a comma-separated list of field names. Each field
  705. * name can have the field type included in the form 'Foo:Date', where
  706. * 'Date' is the type in this instance. In fact, since 'Text' is the
  707. * default field type, 'Date' is probably the only one you need to use
  708. * as the current implementation stands.
  709. * Note that sort field specification is a special case, where the syntax
  710. * can be 'Foo:Date:Desc' or 'Foo:Desc' indicating the sort on the given
  711. * field should be done in descending order.
  712. * @param mixed $fields Comma-delimited fieldname list, or array of fields
  713. */
  function set_sortorder($fields) {
    // Accept a ready-made array, or split a comma-delimited string..
    $fieldspecs = is_array($fields) ? $fields : explode(",", $fields);
    // Strip each spec down to its name plus any ':Desc' sort marker;
    // any types found are defined as a side-effect..
    $sortnames = array();
    foreach ($fieldspecs as $spec) {
      $sortnames[] = $this->strip_field_type($spec);
    }
    // The Sort tag takes the names as a space-delimited list..
    $sorttag = new xmltag("Sort", implode(" ", $sortnames));
    $this->add_xmltag($sorttag);
  } // set_sortorder
  729. // .....................................................................
  730. /**
  731. * Specify a range on a field for querying. We specify the name of a field
  732. * which is used to select articles within the given limits, and
  733. * the limits themselves. Either limit may be passed as nullstring
  734. * which indicates no limit on that side. Any dates must be passed as
  735. * standard Unix timestamps (seconds since 1970).
  736. * Notes: This method can be called multiple times to define additional
  737. * ranges for different field names.
  738. * This method adds the field setting directly via the add_field() method.
  739. * @see add_field
  740. * @param string $range_from Value of lowerbound range
  741. * @param string $range_to Value of upperbound range
  742. * @param string $range_fieldname Name of field to use in range query.
  743. */
  function set_range($range_from="", $range_to="", $range_fieldname="") {
    // Without a field name there is nothing to constrain.
    if ($range_fieldname == "") {
      return;
    }

    // <Range field="..."> container for the optional bounds.
    $rangetag = new xmltag("Range");
    $rangetag->setattribute("field", $this->strip_field_type($range_fieldname));

    // Each bound is only emitted when it is neither loosely equal to the
    // empty string nor loosely equal to false. Note that this means values
    // such as 0 or "0" are treated the same as "no limit" on that side.
    $bounds = array("From" => $range_from, "To" => $range_to);
    foreach ($bounds as $tagname => $bound) {
      if ($bound != "" && $bound != false) {
        $rangetag->childtag( new xmltag($tagname, $bound) );
      }
    }

    $this->add_xmltag($rangetag);
  } // set_range
  757. // .....................................................................
  758. /**
  759. * Supply a stopword list to lucene.
  760. * This method adds the field setting directly via the add_field() method.
  761. * @see add_field
  762. * @param mixed $stopwords Space-delimited list, or array of stopwords
  763. */
  function set_stopwords($stopwords) {
    // An array of words is flattened to a space-delimited string; anything
    // else is passed through untouched.
    $stoplist = is_array($stopwords) ? implode(" ", $stopwords) : $stopwords;
    $this->add_xmltag( new xmltag("Stop-List", $stoplist) );
  } // set_stopwords
  773.  
  774. } // lucene_message class
  775. // ----------------------------------------------------------------------
  776.  
  777. /**
  778. * Encapsulation of the result of a generic search query. This is for
  779. * internal use only.
  780. * @package search
  781. * @access private
  782. */
  class queryresult {
    /** Rank of this result, as supplied to the constructor */
    var $rank = "";
    /** Associative array of fieldname => fieldvalue for this result */
    var $fields = array();

    /**
     * Constructor. PHP 8 no longer treats a method named after the class
     * as a constructor, so this delegates to the original old-style
     * constructor to keep the rank being set on every PHP version.
     * Declared ahead of the old-style method on purpose.
     * @param string $rank Rank of this result
     */
    function __construct($rank="") {
      $this->queryresult($rank);
    }
    /**
     * Old-style constructor, retained for code that calls it explicitly.
     * @param string $rank Rank of this result
     */
    function queryresult($rank="") {
      $this->rank = $rank;
    }
    /**
     * Store a field value against a field name. A later value for the
     * same name replaces the earlier one.
     * @param string $fieldname  Name of the field
     * @param string $fieldvalue Value of the field
     */
    function addfield($fieldname, $fieldvalue) {
      $this->fields[$fieldname] = $fieldvalue;
    }
  } // queryresult class
  794. // ----------------------------------------------------------------------
  795.  
  796. /**
  797. * Class comprising the functionality of a Lucene response parser. This
  798. * is for internal use only.
  799. * @package search
  800. * @access private
  801. */
  class response_parser extends xmlparser {
    /** Current/last tag opened */
    var $tag = "";
    /** Attributes array for current/last tag */
    var $attr = array();
    /** Serial transaction ID */
    var $serial = "";
    /** Status message */
    var $status_message = "";
    /** Error message taken from the content of an Error tag. Declared
        here because this class both reads and writes it. */
    var $error_message = "";
    /** True if response was valid, ie. no errors */
    var $valid = true;
    // .....................................................................
    /**
     * Constructor. PHP 8 no longer treats a method named after the class
     * as a constructor, so this delegates to the old-style constructor.
     * Declared ahead of the old-style method on purpose.
     */
    function __construct() {
      $this->response_parser();
    } // __construct
    // .....................................................................
    /** Construct a new parser (old-style constructor). */
    function response_parser() {
      $this->xmlparser();
    } // response_parser
    // .....................................................................
    /**
     * Method invoked when a tag is opened. Records the tag name, merges
     * its attributes into $attr, and clears the text buffer belonging to
     * the tag so that cdata() can append to it.
     * @param mixed  $parser     Parser handle passed by the XML parser
     * @param string $tag        Name of the tag being opened
     * @param array  $attributes Attributes of the tag
     */
    function tag_open($parser, $tag, $attributes) {
      $this->tag = $tag;
      if (is_array($attributes) && count($attributes) > 0) {
        foreach ($attributes as $key => $value) {
          $this->attr[$key] = $value;
        }
      }
      switch ($tag) {
        case "Error":
          // Any Error tag marks the whole response invalid..
          $this->valid = false;
          $this->error_message = "";
          break;
        case "Status":
          $this->status_message = "";
          break;
        case "Serial":
          $this->serial = "";
          break;
      } // switch
    } // tag_open
    // .....................................................................
    /**
     * Method invoked when character data is available. The underlying
     * parser may deliver the text of one element in several pieces, so
     * each piece is appended rather than assigned; assigning would keep
     * only the final piece.
     * @param mixed  $parser Parser handle passed by the XML parser
     * @param string $cdata  The piece of character data
     */
    function cdata($parser, $cdata) {
      switch ($this->tag) {
        case "Error":
          $this->error_message .= $cdata;
          break;
        case "Status":
          $this->status_message .= $cdata;
          break;
        case "Serial":
          $this->serial .= $cdata;
          break;
      } // switch
    } // cdata
    // .....................................................................
    /**
     * Method invoked when a tag is closed. Emits the debug line for a
     * completed Error/Status message (once, now that the whole text has
     * been collected), then clears the current tag and attributes.
     * @param mixed  $parser Parser handle passed by the XML parser
     * @param string $tag    Name of the tag being closed
     */
    function tag_close($parser, $tag) {
      switch ($tag) {
        case "Error":
          if ($this->error_message != "") {
            debugbr("lucene error: $this->error_message");
          }
          break;
        case "Status":
          if ($this->status_message != "") {
            debugbr("lucene status: $this->status_message");
          }
          break;
      } // switch
      $this->tag = "";
      $this->attr = array();
    } // tag_close
    // .....................................................................
    /**
     * Parse the given XML via the parent parser. The response is flagged
     * invalid if the parent reports the XML as not valid, and any error
     * message collected from the response is written to the system log.
     * @param string $xml The XML response text to parse
     */
    function parse($xml) {
      xmlparser::parse($xml);
      if (!$this->valid_xml) {
        $this->valid = false;
      }
      if ($this->error_message != "") {
        log_sys($this->error_message);
      }
    } // parse

  } // response_parser class
  877. // ----------------------------------------------------------------------
  878.  
  879. /**
  880. * Class comprising the functionality of an XML parser for queries. This
  881. * is for internal use only.
  882. * @package search
  883. * @access private
  884. */
  class queryresponse_parser extends response_parser {
    /** Results returned count */
    var $count = 0;
    /** Array of queryresult objects, keyed on the 'counter' attribute
        of each Result tag. Starts empty so it is always an array. */
    var $results = array();
    /** True while we are inside a Results tag */
    var $results_stream = false;
    /** True once the first piece of text of the current Field tag has
        been stored, so later pieces are appended to it
        @access private */
    var $field_open = false;
    /** Name of the Field currently being collected
        @access private */
    var $field_name = "";
    // .....................................................................
    /**
     * Constructor. PHP 8 no longer treats a method named after the class
     * as a constructor, so this delegates to the old-style constructor.
     * Declared ahead of the old-style method on purpose.
     */
    function __construct() {
      $this->queryresponse_parser();
    } // __construct
    // .....................................................................
    /** Construct a new parser (old-style constructor). */
    function queryresponse_parser() {
      $this->response_parser();
    } // queryresponse_parser
    // .....................................................................
    /**
     * Method invoked when a tag is opened. On top of the parent handling,
     * a Results tag switches result collection on, and a Result tag
     * starts a new result from its 'counter' and 'rank' attributes.
     * @param mixed  $parser     Parser handle passed by the XML parser
     * @param string $tag        Name of the tag being opened
     * @param array  $attributes Attributes of the tag
     */
    function tag_open($parser, $tag, $attributes) {
      response_parser::tag_open($parser, $tag, $attributes);
      switch ($tag) {
        case "Results":
          $this->results_stream = true;
          break;
        case "Result":
          // Missing attributes fall back to nullstring rather than
          // raising an undefined-index notice..
          $this->addresult(
            isset($this->attr["counter"]) ? $this->attr["counter"] : "",
            isset($this->attr["rank"]) ? $this->attr["rank"] : ""
          );
          $this->attr = array();
          break;
      } // switch
    } // tag_open
    // .....................................................................
    /**
     * Method invoked when character data is available. Field text inside
     * a Results tag is stored on the most recently added result. The
     * parser may deliver one Field's text in several pieces: the first
     * piece (when the tag attributes are still present) sets the value,
     * and any further pieces are appended to it.
     * @param mixed  $parser Parser handle passed by the XML parser
     * @param string $cdata  The piece of character data
     */
    function cdata($parser, $cdata) {
      response_parser::cdata($parser, $cdata);
      switch ($this->tag) {
        case "Count":
          $this->count = $cdata;
          break;
        case "Field":
          if ($this->results_stream) {
            if (count($this->attr) > 0) {
              // First piece of this field's text..
              $this->field_name = isset($this->attr["name"]) ? $this->attr["name"] : "";
              $this->field_open = true;
              $this->store_fieldchunk($this->field_name, $cdata, false);
            }
            else if ($this->field_open) {
              // Continuation of the same field's text..
              $this->store_fieldchunk($this->field_name, $cdata, true);
            }
            $this->attr = array();
          }
          break;
      } // switch
    } // cdata
    // .....................................................................
    /**
     * Method invoked when a tag is closed. Ends collection of the current
     * field, and leaving a Results tag switches result collection off.
     * @param mixed  $parser Parser handle passed by the XML parser
     * @param string $tag    Name of the tag being closed
     */
    function tag_close($parser, $tag) {
      response_parser::tag_close($parser, $tag);
      $this->field_open = false;
      switch ($tag) {
        case "Results":
          $this->results_stream = false;
          break;
      } // switch
    } // tag_close
    // .....................................................................
    /**
     * Add a new, empty result to the response.
     * @param string $id   Key to store the result under
     * @param string $rank Rank of the result
     */
    function addresult($id, $rank) {
      $this->results[$id] = new queryresult($rank);
    } // addresult
    // .....................................................................
    /**
     * Store a piece of field text on the most recently added result. The
     * result is popped, updated and pushed back, as the original code did.
     * Does nothing when no result has been added yet.
     * @param string  $fieldname Name of the field being stored
     * @param string  $chunk     The piece of field text
     * @param boolean $append    True to append to the value already stored
     * @access private
     */
    function store_fieldchunk($fieldname, $chunk, $append) {
      if (!is_array($this->results) || count($this->results) == 0) {
        return;
      }
      $result = array_pop($this->results);
      if ($append && isset($result->fields[$fieldname])) {
        $chunk = $result->fields[$fieldname] . $chunk;
      }
      $result->addfield($fieldname, $chunk);
      array_push($this->results, $result);
    } // store_fieldchunk

  } // queryresponse_parser class
  957. // ----------------------------------------------------------------------
  958.  
  959. /**
  960. * The lucene query message class. This class inherits all the functionality
  961. * of the lucene_connection, lucene_msg and lucene_message classes. It adds
  962. * query-specific methods for searching.
  963. * @package search
  964. */
  class lucene_querymsg extends lucene_message {
    /** Set to true if sort limit was exceeded in query */
    var $sort_limit_exceeded = false;
    /** Set to true if Lucene blew its memory trying to sort */
    var $sort_memory_exceeded = false;
    // .....................................................................
    /**
     * Constructor. PHP 8 no longer treats a method named after the class
     * as a constructor, so this delegates to the old-style constructor.
     * Declared ahead of the old-style method on purpose.
     * @param string $application Optional application specifier.
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function __construct($application="?", $host="", $port="") {
      $this->lucene_querymsg($application, $host, $port);
    } // __construct
    // .....................................................................
    /**
     * Old-style constructor. Makes a new Lucene query message of type
     * "LuceneQueryRequest" for the given application and server.
     * @param string $application Optional application specifier.
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function lucene_querymsg($application="?", $host="", $port="") {
      $this->lucene_message("LuceneQueryRequest", $application, $host, $port);
    } // lucene_querymsg
    // .....................................................................
    /**
     * Set the query for this message. The query is added as a Query tag
     * carrying a 'default-field' attribute set to DEFAULT_FIELD.
     * @param string $query The query to submit to Lucene.
     */
    function set_query($query) {
      $queryxml = new xmltag("Query", $query);
      $queryxml->setattribute("default-field", DEFAULT_FIELD);
      $this->add_xmltag($queryxml);
    } // set_query
    // .....................................................................
    /**
     * Send the message to Lucene, and then post-process the response for
     * query hits. For a valid response each result becomes an associative
     * array holding "RANK" plus one entry per returned field, and is
     * appended to $this->hit. For an invalid response the error message
     * is inspected to set the sort limit/memory flags.
     * @param integer $timeoutsecs Override for timeout in seconds
     */
    function send($timeoutsecs="") {
      // Initialise flags..
      $this->sort_limit_exceeded = false;
      $this->sort_memory_exceeded = false;

      // Msg-level send-receive transaction..
      lucene_message::send($timeoutsecs);

      // Process the response to our request..
      $this->response = new queryresponse_parser();
      $this->response->parse($this->responsebuf);

      if ($this->response->valid) {
        // Unpack the returned search query hits and store them
        // locally for use by child classes.
        if (isset($this->response->results)) {
          foreach ($this->response->results as $result) {
            $hit = array();
            $hit["RANK"] = $result->rank;
            foreach ($result->fields as $fieldname => $fieldvalue) {
              $hit[$fieldname] = $fieldvalue;
            }
            $this->hit[] = $hit;
          }
        }
      }
      else {
        // The response can be invalid without any Error text having been
        // received, so do not assume the property has been set..
        $errmsg = isset($this->response->error_message) ? $this->response->error_message : "";
        // Check for sort limit/memory error conditions..
        if (stristr($errmsg, "system sort limit")) {
          $this->sort_limit_exceeded = true;
        }
        if (stristr($errmsg, "out of memory")) {
          $this->sort_memory_exceeded = true;
        }
      }
    } // send

  } // lucene_querymsg class
  1043. // ----------------------------------------------------------------------
  1044.  
  1045. /**
  1046. * The lucene index message class. This class inherits all the functionality
  1047. * of the lucene_connection, lucene_msg and lucene_message classes. It adds
  1048. * indexing-specific methods.
  1049. * @package search
  1050. */
  class lucene_indexmsg extends lucene_message {
    // Public
    /** Indication that the indexing was successful */
    var $indexed = false;

    // Private
    /** A unique handle to identify the index
        response from Lucene
        @access private */
    var $serialno = "";
    // .....................................................................
    /**
     * Constructor. PHP 8 no longer treats a method named after the class
     * as a constructor, so this delegates to the old-style constructor.
     * Declared ahead of the old-style method on purpose.
     * @param string $application Optional application specifier
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function __construct($application="?", $host="", $port="") {
      $this->lucene_indexmsg($application, $host, $port);
    } // __construct
    // .....................................................................
    /**
     * Old-style constructor. Makes a new Lucene index message of type
     * "LuceneIndexRequest", gives it a freshly generated serial number
     * in a Serial tag, and defines the default field as not stored. When
     * the global $RESPONSE has microsites enabled, a stored, indexed
     * "site" field is also defined and indexed with the detected
     * microsite, or APP_NAME if none was detected.
     * @param string $application Optional application specifier
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function lucene_indexmsg($application="?", $host="", $port="") {
      global $RESPONSE;
      $this->lucene_message("LuceneIndexRequest", $application, $host, $port);
      $this->serialno = md5(uniqid(""));
      $this->add_xmltag( new xmltag("Serial", $this->serialno) );
      $this->define_field(DEFAULT_FIELD, DEFAULT_FIELDTYPE, NOT_STORED);
      // Partitioned indexing for microsites..
      if (isset($RESPONSE) && $RESPONSE->microsites_mode == MICROSITES_ENABLED) {
        $site = (isset($RESPONSE->microsite_detected)) ? $RESPONSE->microsite_detected : APP_NAME;
        $this->define_field("site", "text", STORED, INDEXED);
        $this->index_field("site", $site);
      }
    } // lucene_indexmsg
    // .....................................................................
    /**
     * Supply field content for indexing. This causes Lucene to take the given
     * fieldname and index the given value against it. NB: we silently ignore
     * the request for nullstring, since these cause Lucene indexing to throw
     * an exception, and indexing will fail.
     * The field name can have the field type included in the form 'Foo:Date',
     * where 'Date' is the type in this instance. In fact, since 'Text' is the
     * default field type, 'Date' is probably the only one you need to use
     * as the current implementation stands.
     * @param string $fieldname Name of the field to index.
     * @param string $fieldvalue Content of the field to index
     */
    function index_field($fieldname, $fieldvalue) {
      if ($fieldvalue !== "") {
        $fieldname = $this->strip_field_type($fieldname);
        $this->add_field($fieldname, $fieldvalue);
      }
    } // index_field
    // .....................................................................
    /**
     * Index the given content against the given ID. An Id tag is added
     * for the ID, and the content is added as the DEFAULT_FIELD field
     * after newlines, carriage returns and tabs have been replaced by
     * spaces and runs of spaces collapsed to a single space. Nullstring
     * content is silently ignored.
     * @param string $id The ID to associate with the given indexed data.
     * @param string $content The text content to be indexed.
     */
    function index_content($id, $content) {
      if ($content !== "") {
        $this->add_xmltag( new xmltag("Id", $id) );
        $content = preg_replace("/[\n\r\t]/", " ", $content);
        $content = preg_replace("/[ ]{2,}/", " ", $content);
        $this->add_field(DEFAULT_FIELD, $content);
      }
    } // index_content
    // .....................................................................
    /**
     * Send the message to Lucene, and then post-process the response for
     * indication of a successful index operation. We expect to receive
     * a response back from Lucene which has our serialno in it. This method
     * returns True if the indexing was successful, else False.
     * @param integer $timeoutsecs Override for timeout in seconds
     * @return boolean True if indexing was successful.
     */
    function send($timeoutsecs="") {
      // Start from a clean slate, so that a failed send on a re-used
      // object can never report the success of an earlier send..
      $this->indexed = false;

      // Msg-level send-receive transaction..
      lucene_message::send($timeoutsecs);

      // Process the response to our request..
      $this->response = new response_parser();
      $this->response->parse($this->responsebuf);

      // Only a valid response echoing our own serial counts as success.
      // The comparison is strict so that two different hex strings which
      // both happen to look numeric are never treated as equal.
      if ($this->response->valid) {
        $this->indexed = ($this->response->serial === $this->serialno);
      }
      // Return status of indexing operation..
      return $this->indexed;
    } // send

  } // lucene_indexmsg class
  1145. // ----------------------------------------------------------------------
  1146.  
  1147. /**
  1148. * The lucene unindex message class. This class allows you to remove an
  1149. * item from the Lucene index. You must know the unique ID that identifies
  1150. * the document.
  1151. * @package search
  1152. */
  class lucene_unindexmsg extends lucene_message {
    // .....................................................................
    /**
     * Constructor. PHP 8 no longer treats a method named after the class
     * as a constructor, so this delegates to the old-style constructor.
     * Declared ahead of the old-style method on purpose.
     * @param string $application Optional application specifier
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function __construct($application="?", $host="", $port="") {
      $this->lucene_unindexmsg($application, $host, $port);
    } // __construct
    // .....................................................................
    /**
     * Old-style constructor. Makes a new Lucene unindex message of type
     * "LuceneUnIndexRequest". This message is provided to allow you to
     * delete an item from the Lucene index. It has a single method
     * 'unindex' which takes the ID of the item to delete.
     * @param string $application Optional application specifier
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function lucene_unindexmsg($application="?", $host="", $port="") {
      $this->lucene_message("LuceneUnIndexRequest", $application, $host, $port);
    } // lucene_unindexmsg
    // .....................................................................
    /**
     * Nominate the document to unindex, as identified by its unique ID.
     * This adds an Id tag to the message.
     * @param string $id The ID to allow Lucene to identify the item to unindex
     */
    function unindex($id) {
      $this->add_xmltag( new xmltag("Id", $id) );
    } // unindex

  } // lucene_unindexmsg class
  1177. // ----------------------------------------------------------------------
  1178.  
  1179. /**
  1180. * The lucene purge message class. This class allows you to remove all
  1181. * items from the Lucene index. Take care!
  1182. * @package search
  1183. */
  class lucene_purgemsg extends lucene_message {
    // .....................................................................
    /**
     * Constructor. PHP 8 no longer treats a method named after the class
     * as a constructor, so this delegates to the old-style constructor.
     * Declared ahead of the old-style method on purpose.
     * @param string $application Optional application specifier
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function __construct($application="?", $host="", $port="") {
      $this->lucene_purgemsg($application, $host, $port);
    } // __construct
    // .....................................................................
    /**
     * Old-style constructor. Makes a new Lucene purge message, provided
     * to allow you to delete all items from the Lucene index. The message
     * is built as a "LuceneUnIndexRequest" carrying an empty Purge tag.
     * @param string $application Optional application specifier
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function lucene_purgemsg($application="?", $host="", $port="") {
      $this->lucene_message("LuceneUnIndexRequest", $application, $host, $port);
      $this->add_xmltag( new xmltag("Purge") );
    } // lucene_purgemsg

  } // lucene_purgemsg class
  1199. // ----------------------------------------------------------------------
  1200.  
  1201. /**
  1202. * The lucene utility message class. Used for special Lucene operations.
  1203. * @package search
  1204. */
  class lucene_utilitymsg extends lucene_message {
    /**
     * Constructor. PHP 8 no longer treats a method named after the class
     * as a constructor, so this delegates to the old-style constructor.
     * Declared ahead of the old-style method on purpose.
     * @param string $utilitycmd Command for this utility message.
     * @param string $application Optional application specifier
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function __construct($utilitycmd="", $application="?", $host="", $port="") {
      $this->lucene_utilitymsg($utilitycmd, $application, $host, $port);
    } // __construct
    // .....................................................................
    /**
     * Old-style constructor. Makes a "LuceneUtilityRequest" message and,
     * when a command is given, adds it as a Utility tag.
     * @param string $utilitycmd Command for this utility message.
     * @param string $application Optional application specifier
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function lucene_utilitymsg($utilitycmd="", $application="?", $host="", $port="") {
      $this->lucene_message("LuceneUtilityRequest", $application, $host, $port);
      if ($utilitycmd != "") {
        $this->add_xmltag( new xmltag("Utility", $utilitycmd) );
      }
    } // lucene_utilitymsg
    // .....................................................................
    /**
     * Send the message to Lucene, and then post-process the response for
     * indication of a successful utility operation. We expect to receive
     * a response back from Lucene which has nothing much in it, unless
     * there has been an error.
     * Returns True if the operation was successful, else False.
     * @param integer $timeoutsecs Override for timeout in seconds
     * @return boolean True if operation was successful.
     */
    function send($timeoutsecs="") {
      // Msg-level send-receive transaction..
      lucene_message::send($timeoutsecs);

      // Process the response to our request..
      $this->response = new response_parser();
      $this->response->parse($this->responsebuf);

      // Return status of the utility operation..
      return $this->response->valid;
    } // send

  } // lucene_utilitymsg class
  1241. // ----------------------------------------------------------------------
  1242.  
  1243. /**
  1244. * The lucene search class
  1245. * This class inherits the functionality of the generic 'search' class. It
  1246. * extends it to implement a LUCENE search. Use the methods in this class
  1247. * as the mainstay in implementing queries of content from Lucene. Most
  1248. * methods, such as match(), matchfield(), matchrange() etc. store the
  1249. * requirement in the class for subsequent building using the set_*()
  1250. * methods of the lucene classes to set the relevant fields. This is only
  1251. * done when you call execute(), and the query is built from all the
  1252. * composite terms you have added via match() et al.
  1253. * @package search
  1254. */
  class lucene_search extends lucene_querymsg {
    // .....................................................................
    /**
     * Constructor. PHP 8 no longer treats a method named after the class
     * as a constructor, so this delegates to the old-style constructor.
     * Declared ahead of the old-style method on purpose.
     * @param string $application Application name/domain name for searching in
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function __construct($application="?", $host="", $port="") {
      $this->lucene_search($application, $host, $port);
    } // __construct
    // .....................................................................
    /**
     * Old-style constructor. Creates a new lucene search: runs the
     * search() and lucene_querymsg() set-up followed by initialise().
     * When the global $RESPONSE has microsites enabled, the search is
     * restricted with a must-match term on the "site" field, using the
     * detected microsite, or APP_NAME if none was detected.
     * @param string $application Application name/domain name for searching in
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function lucene_search($application="?", $host="", $port="") {
      global $RESPONSE;
      $this->search();
      $this->lucene_querymsg($application, $host, $port);
      $this->initialise();
      // Partitioned indexing for microsites..
      if (isset($RESPONSE) && $RESPONSE->microsites_mode == MICROSITES_ENABLED) {
        $site = (isset($RESPONSE->microsite_detected)) ? $RESPONSE->microsite_detected : APP_NAME;
        $this->must_match("site:$site");
      }
    } // lucene_search
    // .....................................................................
    /**
     * Add a new search term to match. Search terms can be a single word or
     * compound patterns. Each time one of these is added, it has an operator
     * associated with it - whether this term is a "may have" (OR), or a
     * "must have" (AND) term.
     * NB: This method overrides the parent method in order to ensure that
     * the lower-case boolean words ' and ', ' or ' and ' not ' inside the
     * term, and the operator itself, are in upper case as Lucene requires.
     * A non-empty boost is appended to the term as '^boost'.
     * @param string $term Search term text to match.
     * @param integer $op Joining operator: 'AND', 'OR', 'NOT', 'AND NOT'.
     * @param string $id An optional ID to associate with this search term.
     * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
     */
    function match($term, $op="OR", $id="", $boost="") {
      $LCops = array("/ and /","/ or /","/ not /");
      $UCops = array(" AND "," OR "," NOT ");
      $term = preg_replace($LCops, $UCops, $term);
      if ($boost != "") $term .= "^$boost";
      search::match($term, strtoupper($op), $id);
    } // match
    // .....................................................................
    /**
     * Add search term to match a field value.
     * This is used to add a search term which defines the value that a given
     * field may or may not contain for the search to succeed. When an array
     * of values is given, the non-blank values are joined with OR into a
     * single term.
     * For adding terms which are 'free' (as a user might type into a search
     * box for example) then you can use the match() method.
     * @param string $fieldname Name of field to reference in the index
     * @param mixed $fieldvalue Value or array of values, for field to match
     * @param string $op Operator to join this term to others in the query
     * @param string $id Optional identity tag for this term
     * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
     */
    function matchfield($fieldname, $fieldvalue, $op="OR", $id="", $boost="") {
      // Bail out before the trace is opened: returning after
      // debug_trace($this) would leave it without its closing call.
      if (!isset($fieldvalue)) return;
      debug_trace($this);
      if (!is_array($fieldvalue)) {
        $fieldvalue = array($fieldvalue);
      }
      $term = "";
      foreach ($fieldvalue as $value) {
        $value = trim($value);
        if ($value != "") {
          $term .= " OR " . $this->fieldterm($fieldname, $value);
        }
      }
      if ($term != "") {
        $term = substr($term, 4); // Get rid of initial OR
        // Register the search term..
        $this->match($term, strtoupper($op), $id, $boost);
      }
      debug_trace();
    } // matchfield
    // .....................................................................
    /**
     * Helper function to build field search term. Returns
     * 'fieldname:fieldvalue', or just the value when the field is
     * DEFAULT_FIELD.
     * @param string $fieldname Name of field to reference in the index
     * @param string $fieldvalue Value of field to match
     * @return string The search term
     * @access private
     */
    function fieldterm($fieldname, $fieldvalue) {
      if ($fieldname != DEFAULT_FIELD) {
        $term = "$fieldname:$fieldvalue";
      }
      else {
        $term = $fieldvalue;
      }
      return $term;
    } // fieldterm
    // .....................................................................
    /**
     * Add search term to match a field value range.
     * This is used to add a search term which defines the range of values that
     * a given field may or may not contain for the search to succeed.
     * NB: This method is always a must match (implied AND) search term. In
     * other words the search is always restricted/refined by it.
     * @param string $fromvalue Lower range value of field to match
     * @param string $tovalue Upper range value of field to match
     * @param string $fieldname Name of field to apply the range to
     */
    function matchrange($fromvalue, $tovalue, $fieldname) {
      debug_trace($this);
      $this->set_range($fromvalue, $tovalue, $fieldname);
      debug_trace();
    } // matchrange
    // .....................................................................
    /**
     * Add search term: must match a field value.
     * This is used to add a search term which defines the value that a given
     * field must contain for the search to succeed.
     * @param string $fieldname Name of field
     * @param string $fieldvalue Value of field to match
     * @param string $id Optional identity tag for this term
     * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
     */
    function must_matchfield($fieldname, $fieldvalue, $id="", $boost="") {
      $this->matchfield($fieldname, $fieldvalue, "AND", $id, $boost);
    } // must_matchfield
    // .....................................................................
    /**
     * Add search term: may match a field value.
     * This is used to add a search term which defines the value that a given
     * field may contain for the search to succeed.
     * @param string $fieldname Name of field
     * @param string $fieldvalue Value of field to match
     * @param string $id Optional identity tag for this term
     * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
     */
    function may_matchfield($fieldname, $fieldvalue, $id="", $boost="") {
      $this->matchfield($fieldname, $fieldvalue, "OR", $id, $boost);
    } // may_matchfield
    // .....................................................................
    /**
     * Add search term: must not match a field value.
     * This is used to add a search term which defines the value that a given
     * field must not contain for the search to succeed.
     * @param string $fieldname Name of field
     * @param string $fieldvalue Value of field to match
     * @param string $id Optional identity tag for this term
     * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
     */
    function does_not_matchfield($fieldname, $fieldvalue, $id="", $boost="") {
      $this->matchfield($fieldname, $fieldvalue, "NOT", $id, $boost);
    } // does_not_matchfield
    // .....................................................................
    /**
     * Execute the search.
     * When queryvalid() passes, the query string is set on the message,
     * followed by the limit (and offset, if any) when max_results is
     * greater than zero, and the date range if one is defined. The
     * message is then sent to Lucene and the search flagged as executed.
     * Otherwise a debug line reports the invalid query.
     * @param integer $timeoutsecs Override for timeout in seconds
     */
    function execute($timeoutsecs="") {
      debug_trace($this);

      // The queryvalid() method is in the parent class 'search', and
      // calls the build() method in the same class. The build() method is
      // a raw routine to join together the search terms with ANDs and
      // ORs. You may have to override it for Lucene. If so, just create
      // a new build() method in this class.

      if ($this->queryvalid()) {

        // Define the query string..
        $this->set_query($this->query);

        // Set limit, offset..
        if ($this->max_results > 0) {
          $this->set_limit($this->max_results);
          if ($this->skip_results > 0) {
            $this->set_first($this->skip_results);
          }
        }

        // Set any daterange..
        if ($this->has_daterange()) {
          $this->set_range($this->date_start, $this->date_end, $this->date_fieldname);
        }

        // Send to Lucene..
        $this->send($timeoutsecs);

        // Flag that we did it..
        $this->executed = true;
        debugbr("lucene search: exec ok: returning " . $this->hitcount() . " hits");
      }
      else {
        debugbr("lucene search: invalid query: '$this->query'");
      }
      debug_trace();
    } // execute

  } // lucene_search class
  1449. // ----------------------------------------------------------------------
  1450.  
  1451. /**
  1452. * The lucene file indexer class.
  1453. * This class indexes files on disc, either one by one or as a whole
  1454. * file hierarchy tree.
  1455. * @package search
  1456. */
  1457. class lucene_fileindexer {
  1458. // Public
  1459. /** Application we are indexing for */
  1460.  
  1461. var $application = "";
  1462. /** Host to connect to */
  1463.  
  1464. var $host = "";
  1465. /** Port to connect to */
  1466.  
  1467. var $port = "";
  1468.  
  1469. // Private
  1470. /** The index ID
  1471. @access private */
  1472. var $ixid;
  1473. /** ID generation source
  1474. @access private */
  1475. var $idsource = ID_FROM_INC;
  1476. /** Scan for meta tags as fields in file content. Recommended.
  1477. @access private */
  1478. var $metascan = true;
  1479. /** Meta fields definitions array. Contains definitions
  1480. for the fields we will process if found as meta tags.
  1481. @access private */
  1482. var $meta_fields = array();
  1483. /** Index fields definitions array. Contains definitions
  1484. for the fields we are expecting to index.
  1485. @access private */
  1486. var $field_definitions = array();
  1487. /** Fields for indexing. This is an array of fieldname/value
  1488. pairs which should be added during the indexing. These
  1489. fields do not have to appear in $field_definitions.
  1490. @access private */
  1491. var $indexfields = array();
  1492. /** ID generation offset
  1493. @access private */
  1494. var $idoffset = 0;
  1495. /** ID generation prefix
  1496. @access private */
  1497. var $idprefix = "";
  1498. /** The index object which does the work
  1499. @access private */
  1500. var $lucene_indexer;
  1501. /** Timeout for indexing commands in seconds (can usually leave
  1502. as nullstring)
  1503. @access private */
  1504. var $timeoutsecs = "";
  1505. /** Path to a lockfile we should give way to. If this value
  1506. is not nullstring, then no indexing will be done while the
  1507. file exists. If lockfile_wait is > 0, then we only wait
  1508. this many seconds.
  1509. @access private */
  1510. var $lockfile = "";
  1511. /** Number of seconds to wait on a lockfile. If zero, wait forever.
  1512. @access private */
  1513. var $lockfile_wait_secs = 0;
  1514. /** Indexing execution timer
  1515. @access private */
  1516. var $timer;
  1517. // .....................................................................
  1518. /**
  1519. * Constructor
  1520. * Create a new lucene indexer
  1521. * @param string $application Application name
  1522. * @param string $host Hostname or IP of Lucene server
  1523. * @param string $port Port of Lucene server
  1524. */
  1525. function lucene_fileindexer($application="?", $host="", $port="") {
  1526. // Store for reference..
  1527. $this->application = $application;
  1528. $this->host = $host;
  1529. $this->port = $port;
  1530. $this->timer = new microtimer();
  1531. } // lucene_fileindexer
  1532. // .....................................................................
  1533. /**
  1534. * Define a field. We supply the name of the field, it's type (Text, Date
  1535. * or Id), and whether it should be stored by Lucene for later retreival in
  1536. * queries. For example you would not store the raw document/content as this
  1537. * is usually stored elsewhere.
  1538. * IMPORTANT NOTE: Fields defined here will automatically be included as
  1539. * meta fields.
  1540. * @see meta_fields()
  1541. * @param string $fieldname Name of the field to index
  1542. * @param string $type Type of field data: Text, Date or Id.
  1543. * @param boolean $stored If true then Lucene will store the content itself
  1544. * @param boolean $indexed If true then Lucene will index the field content
  1545. */
  1546. function define_field($fieldname, $type, $stored=STORED, $indexed=INDEXED) {
  1547. $this->field_definitions[$fieldname]
  1548. = $type . "|" . (($stored) ? "true" : "false") . "|" . (($indexed) ? "true" : "false");
  1549. // Register for meta tags..
  1550. $this->meta_field($fieldname, $type);
  1551. } // define_field
  1552. // .....................................................................
  1553. /**
  1554. * Define a lockfile which we must avoid during indexing. If defined
  1555. * then no indexing will take place while the lockfile exists. The
  1556. * second parameter allows you to specify a limit to the patience of
  1557. * this process, in seconds. Zero means wait forever.
  1558. * @param string $lockfile Path to the lockfile. Nullstring = not defined
  1559. * @param integer $wait_secs Time to wait for lockfile. Zero means forever.
  1560. */
  1561. function avoid_lockfile($lockfile, $wait_secs=0) {
  1562. $this->lockfile = $lockfile;
  1563. $this->lockfile_wait_secs = $wait_secs;
  1564. } // avoid_lockfile
  1565. // .....................................................................
  1566. /**
  1567. * Define a field as a meta tag. This ensures that the field will be
  1568. * picked up from the file meta tags, if present. If it is not listed
  1569. * here then it will be ignored.
  1570. * IMPORTANT NOTE: We define the strict rule that ONLY fields which have
  1571. * been defined here can be added to the indexing via the meta tag scanning.
  1572. * Ie. you must define fields here explicitly, or via the define_field()
  1573. * method, or they will be ignored even if they turn up as a meta tag.
  1574. * This is so we can restrict the indexing, and be sure of field types.
  1575. * @see define_field()
  1576. * @param string $fieldname Name of the field to process as meta tag
  1577. * @param string $type Type of field data: Text, Date or Id.
  1578. */
  1579. function meta_field($fieldname, $type) {
  1580. $this->meta_fields[$fieldname] = $type;
  1581. } // meta_field
  1582. // .....................................................................
  1583. /**
  1584. * Supply field content for indexing. This causes Lucene to take the given
  1585. * fieldname and index the given value against it.
  1586. * The field name can have the field type included in the form 'Foo:Date',
  1587. * where 'Date' is the type in this instance. In fact, since 'Text' is the
  1588. * default filed type, 'Date' is probably the only one you need to use
  1589. * as the current implementation stands.
  1590. * @param string $fieldname Name of the field to index.
  1591. * @param string $fieldvalue Content of the field to index
  1592. */
  1593. function index_field($fieldname, $fieldvalue) {
  1594. $this->indexfields[$fieldname] = $fieldvalue;
  1595. } // index_field
  1596. // .....................................................................
  1597. /**
  1598. * Set the source for ID generation. Since we are indexing a bunch of
  1599. * files, the ID's have to be generated on demand inside the loop. So
  1600. * we provide for various ways here, and you can extend this class to
  1601. * provide more if required.
  1602. * Main ways:
  1603. * ID_FROM_INC Increment a counter by 1 each time (with offset)
  1604. * ID_FROM_NAME Take the filename, strip the extension, add prefix
  1605. * ID_FROM_FILENAME Take the full filename, add prefix
  1606. * ID_FROM_PATH Take the full file path
  1607. * NB: These are all defined as integer constants.
  1608. * @param integer $idsource Source of ID generation
  1609. * @param mixed $pfxofs String prefix, or integer offset
  1610. */
  1611. function id_generate($idsource=ID_FROM_INC, $pfxofs="") {
  1612. $this->idsource = $idsource;
  1613. if ($pfxofs != "") {
  1614. if (is_string($pfxofs)) {
  1615. $this->idprefix = $pfxofs;
  1616. }
  1617. else {
  1618. $this->idoffset = (int)$pfxofs;
  1619. }
  1620. }
  1621. } // id_generate
  1622. // .....................................................................
  1623. /**
  1624. * Flag that we should do a tag scan on the content of the files to try
  1625. * and extract fields to index. Note that any tags thus found will only
  1626. * be used if the field name has been defined with the method define_field();
  1627. * This causes both the <title> tag and <meta> tags to be considered.
  1628. * @see lucene_fileindexer::define_field()
  1629. */
  1630. function scantags() {
  1631. $this->metascan = true;
  1632. } // scantags
  1633. // .....................................................................
  1634. /**
  1635. * Flag that we should NOT do a tag scan on the content of the files.
  1636. */
  1637. function noscantags() {
  1638. $this->metascan = false;
  1639. } // noscantags
  1640. // .....................................................................
  1641. /**
  1642. * Index a file located at the given path, using given ID.
  1643. * You can also use the parameter $fields to supply an array of
  1644. * fieldname/value pairs to index with this file, for one-off indexing of
  1645. * files. If the fieldname is a date field, make sure to define the
  1646. * name as 'Foo:Date', to cause the field definition to be correct.
  1647. * @param string $path Path to the head of the file tree to index
  1648. * @param string $id ID to associate with the indexed file content
  1649. * @param mixed $fields Array of field/values to index with file
  1650. */
  1651. function index_file($path, $id, $fields=false) {
  1652. $success = false;
  1653. $f = new inputfile($path);
  1654. if ($f->opened) {
  1655. $f->readall();
  1656. $f->closefile();
  1657.  
  1658. // Wait for a lockfile, if we really have to..
  1659. if ($this->lockfile != "" && file_exists($this->lockfile)) {
  1660. $waitforit = true;
  1661. debugbr("waiting for lockfile..", DBG_DEBUG);
  1662. if ($this->lockfile_wait_secs > 0) {
  1663. $locktimer = new microtimer();
  1664. $locktimer->start();
  1665. }
  1666. do {
  1667. clearstatcache();
  1668. if (!file_exists($this->lockfile)) {
  1669. $waitforit = false;
  1670. debugbr("lockfile has been removed..", DBG_DEBUG);
  1671. }
  1672. elseif ($this->lockfile_wait_secs > 0 && $locktimer->secs() >= $this->lockfile_wait_secs) {
  1673. $waitforit = false;
  1674. debugbr("lockfile wait (" . $this->lockfile_wait_secs ."secs) timed out..", DBG_DEBUG);
  1675. }
  1676. else {
  1677. sleep(1);
  1678. }
  1679. } while ($waitforit === true);
  1680. }
  1681.  
  1682. // Create the index message..
  1683. $ix = new lucene_indexmsg($this->application, $this->host, $this->port);
  1684.  
  1685. // Define the fields for the index message..
  1686. foreach ($this->field_definitions as $fieldname => $attributes) {
  1687. $bits = explode("|", $attributes);
  1688. $type = $bits[0];
  1689. $stored = (strcasecmp($bits[1], "true") == 0);
  1690. $indexed = (strcasecmp($bits[2], "true") == 0);
  1691. $ix->define_field($fieldname, $type, $stored, $indexed);
  1692. }
  1693.  
  1694. // Scan file content for meta tags for index fields..
  1695. $content = preg_replace("/[\xe2][\x80]./", "", $f->content);
  1696. $content = preg_replace("/[\xc2][\xb7]./", "", $content);
  1697. $content = preg_replace("/[\xc2]&/", " ", $content);
  1698. $content = preg_replace("/[\xc3]&/", " ", $content);
  1699.  
  1700. if ($this->metascan) {
  1701. $tagpat = "/<meta name=\"(.*?)\" content=\"(.*?)\">/i";
  1702. $matches = array();
  1703. if (preg_match_all($tagpat, $content, $matches)) {
  1704. for ($i=0; $i < count($matches[0]); $i++) {
  1705. $fieldname = $matches[1][$i];
  1706. $fieldvalue = $matches[2][$i];
  1707. if (isset($this->meta_fields[$fieldname])) {
  1708. // Get type..
  1709. $type = $this->meta_fields[$fieldname];
  1710. if (!strcasecmp($type, "date")) {
  1711. // Newsquest date field format requires stripping off a prefix
  1712. // 'DT' - a temporary hack which should be completely transparent
  1713. // to everyone else using this. NB: originally NewsQuest only
  1714. // stored date in 'DTdd/mm/yyyy' format. This parsing is also
  1715. // compatible with the new 'DTdd/mm/yyyy hh:mm[:ss]' format.
  1716. if (substr($fieldvalue, 0, 2) == "DT") {
  1717. $fieldvalue = substr($fieldvalue, 2);
  1718. }
  1719. // Need to convert to Unix timestamp..
  1720. $ts = displaydate_to_timestamp($fieldvalue);
  1721. $fieldvalue = $ts;
  1722. }
  1723. debugbr("meta tag index field: $fieldname=$fieldvalue");
  1724. $ix->index_field($fieldname, $fieldvalue);
  1725. }
  1726. else {
  1727. debugbr("rejected unlisted tag field: $fieldname");
  1728. }
  1729. }
  1730. }
  1731. // Check for title tag in HTML page if required field..
  1732. if (preg_match("/<(title)>(.*?)<\/title>/i", $content, $matches)) {
  1733. $fieldname = $matches[1];
  1734. $fieldvalue = $matches[2];
  1735. if (isset($this->meta_fields[$fieldname])) {
  1736. $type = $this->meta_fields[$fieldname];
  1737. debugbr("title tag index field: $fieldname=$fieldvalue");
  1738. $ix->index_field($fieldname, $fieldvalue);
  1739. }
  1740. }
  1741. } // metascan
  1742.  
  1743. // Deal with passed-in field settings. These are meant to cater
  1744. // for indexing of individual files using this method. We just
  1745. // add them to any existing field/values already set up..
  1746. if ($fields) {
  1747. reset($fields);
  1748. while (list($fieldname, $fieldvalue) = each($fields)) {
  1749. $this->index_field($fieldname, $fieldvalue);
  1750. }
  1751. }
  1752.  
  1753. // Process field/value pairs which have been added either by the
  1754. // index_field() method, or passed in via the $fields parameter..
  1755. if (count($this->indexfields) > 0) {
  1756. reset($this->indexfields);
  1757. while (list($fieldname, $fieldvalue) = each($this->indexfields)) {
  1758. $bits = explode(":", $fieldname);
  1759. $type = ((isset($bits[1])) ? $bits[1] : "Text");
  1760. $fieldname = $bits[0];
  1761. debugbr("index field: $fieldname=$fieldvalue");
  1762. $ix->define_field($fieldname, $type);
  1763. $ix->index_field($fieldname, $fieldvalue);
  1764. }
  1765. }
  1766.  
  1767. // Index the file content. We get rid of any HTML tags..
  1768. debugbr("indexing file: $path, ID=$id");
  1769. $ix->index_content($id, strip_tags($content));
  1770.  
  1771. // Send the index message to lucene. We specify a large
  1772. // timeout since we really want this to succeed and Lucene
  1773. // may be in an optimization fugue..
  1774. $success = $ix->send(120);
  1775. if(!$success) {
  1776. debugbr("failed: $ix->error_msg");
  1777. }
  1778. }
  1779. else {
  1780. debugbr("open failed on '$path'");
  1781. }
  1782. return $success;
  1783. } // index_file
  1784. // .....................................................................
  1785. /**
  1786. * Index a tree of files starting at the path given. We index these in one
  1787. * of four modes, which determines how we generate the ID for each item:
  1788. * 'ID_FROM_INC' mode uses an incremental counter starting at 1. If $prefix
  1789. * holds a number, the counter will start at this number instead of one.
  1790. * Each item has an ID incremented by one from the last one.
  1791. * 'ID_FROM_NAME' mode uses the filename, stripped of any path and extension
  1792. * as the ID. If prefix is not a nullstring, then it is prefixed to every
  1793. * filename ID.
  1794. * 'ID_FROM_FILENAME' mode uses the filename, including any extension
  1795. * as the ID. If prefix is not a nullstring, then it is prefixed to every
  1796. * filename ID.
  1797. * 'ID_FROM_PATH' mode uses the full path to the item being indexed as the
  1798. * ID. If prefix is not a nullstring, then it is prefixed to every
  1799. * filename ID.
  1800. * The file will simply be indexed as a single Text field, with the
  1801. * appropriate ID, and no other index fields unless $metascan is set to TRUE.
  1802. * If this is the case, the system will scan the file for HTML meta tags of
  1803. * form: '<meta name="foo" content="bar">'. In this example a field of name
  1804. *'foo' would be given value 'bar'.
  1805. * @param string $path Path to the head of the file tree to index
  1806. * @param $patt Pattern to match, eg. '*.html'
  1807. * @param $restart If equal to "restart" then treat $path as file of paths
  1808. * @param $lockfile If path is set, we idle whilst this file exists
  1809. * @param string $lockfile Path to the lockfile. Nullstring = not defined
  1810. * @param integer $wait_secs Time to wait for lockfile. Zero means forever.
  1811. */
  1812. function index_tree($path, $patt="", $restart="", $lockfile="", $wait_secs=0) {
  1813. // Set up any lockfile definition..
  1814. $this->avoid_lockfile($lockfile, $lockfile_wait_secs);
  1815.  
  1816. if ($restart == "restart") {
  1817. // Restart from existing paths file..
  1818. $tmpfname = $path;
  1819. debugbr("restarting with existing item list $path", DBG_DEBUG);
  1820. }
  1821. else {
  1822. // Use find to generate item list to a temporary file..
  1823. debugbr("generating item list", DBG_DEBUG);
  1824. $tmpfname = tempnam("/tmp", "LU");
  1825. $cmd = "find $path";
  1826. if ($patt != "") $cmd .= " -name \"$patt\"";
  1827. $cmd .= " >$tmpfname";
  1828. exec($cmd);
  1829. }
  1830. $treelist = new inputfile($tmpfname);
  1831. if ($treelist->opened) {
  1832. // Find the number of items..
  1833. debugbr("counting items", DBG_DEBUG);
  1834. $todo = (int) exec("cat $tmpfname|wc -l");
  1835. if ($todo > 0) {
  1836. $done = 0; $succeeded = 0; $failed = 0; $last = 0;
  1837. debugbr("$todo items to index", DBG_DEBUG);
  1838. $this->timer->start();
  1839. $idix = 0;
  1840. if ($this->idsource == ID_FROM_INC) {
  1841. $idix += $this->idoffset;
  1842. }
  1843.  
  1844. while ($path = $treelist->readln()) {
  1845. // Generate an ID to use..
  1846. switch ($this->idsource) {
  1847. case ID_FROM_INC:
  1848. // Use incremented index..
  1849. $id = $idix + 1;
  1850. $idix += 1;
  1851. break;
  1852.  
  1853. case ID_FROM_NAME:
  1854. // Use filename, minus extenaion..
  1855. $fname = basename($path);
  1856. if (strstr($fname, ".")) {
  1857. $bits = explode(".", $fname);
  1858. $dummy = array_pop($bits);
  1859. $fname = implode(".", $bits);
  1860. }
  1861. $id = $this->idprefix . $fname;
  1862. break;
  1863.  
  1864. case ID_FROM_FILENAME:
  1865. // Use full filename..
  1866. $id = $this->idprefix . basename($path);
  1867. break;
  1868.  
  1869. case ID_FROM_PATH:
  1870. // Use full file path..
  1871. $id = $this->idprefix . $path;
  1872. break;
  1873. } // switch
  1874.  
  1875. // Index the file with new ID..
  1876. if ($this->index_file($path, $id)) {
  1877. debugbr("$id indexed", DBG_DEBUG);
  1878. $succeeded += 1;
  1879. }
  1880. else {
  1881. debugbr("$path index failed", DBG_DEBUG);
  1882. //break;
  1883. $failed += 1;
  1884. }
  1885.  
  1886. // Progress check..
  1887. $done += 1;
  1888.  
  1889. // If the verbose output option is enabled, we compile
  1890. // stats and display these via the debugger..
  1891. if (debugging()) {
  1892. $pct = ($done / $todo) * 100;
  1893. $pct_int = (int)(floor($pct));
  1894. $pct_mod = $pct % 5;
  1895. if ($pct_mod == 0 && $pct_int > $last) {
  1896. $secperdoc = $this->timer->secs() / $done;
  1897. $timedone = $this->timer->formatted_time();
  1898. $timeleft = nicetime(($todo - $done) * $secperdoc);
  1899. $ms = $this->timer->millisecs();
  1900. $msper = number_format( ($ms / $done), 0);
  1901. debugbr("Mark: $pct_int% $timedone ($done) Rate:$msper" . "ms/item Left:$timeleft", DBG_DEBUG);
  1902. $last = $pct_int;
  1903. }
  1904. }
  1905. } // while
  1906.  
  1907. // Close tree list file..
  1908. $treelist->closefile();
  1909.  
  1910. // Wrap it up..
  1911. $this->timer->stop();
  1912.  
  1913. // Final stats if verbose mode..
  1914. if (debugging()) {
  1915. $secs = $this->timer->secs();
  1916. $msper = number_format( (1000 * $secs / $todo), 2);
  1917. $sper1000 = number_format( ($secs / $todo) * 1000, 2);
  1918. debugbr("time taken per item: " . $msper . "msec", DBG_DEBUG);
  1919. debugbr("time per 1000 items: " . nicetime($sper1000), DBG_DEBUG);
  1920. debugbr("total time taken: " . $this->timer->formatted_time(), DBG_DEBUG);
  1921. debugbr("successfully indexed: $succeeded", DBG_DEBUG);
  1922. debugbr("indexing failures: $failed", DBG_DEBUG);
  1923. }
  1924. }
  1925. else {
  1926. debugbr("nothing to index", DBG_DEBUG);
  1927. }
  1928. }
  1929. else {
  1930. debugbr("failed to open $tmpfname", DBG_DEBUG);
  1931. }
  1932. } // index_tree
  1933.  
  1934. } // lucene_fileindexer class
  1935. // ----------------------------------------------------------------------
  1936.  
  1937. /**
  1938. * Function to optimize the Lucene index. This would commonly
  1939. * be used after a batch of items have been indexed.
  1940. * @param string $application Application name/domain name for searching in
  1941. * @param string $host Hostname or IP of Lucene server
  1942. * @param string $port Port of Lucene server
  1943. * @return boolean True if the operation was successful.
  1944. */
  1945. function lucene_optimize($application="?", $host="", $port="") {
  1946. $optimizer = new lucene_utilitymsg("OPTIMIZE", $application, $host, $port);
  1947. $optimizer->send(SOCK_FOREVER);
  1948. return $optimizer->response->valid;
  1949. } // lucene_optimize
  1950. // ----------------------------------------------------------------------
  1951.  
  1952. /**
  1953. * Function to make a backup of the Lucene index. This would commonly
  1954. * be used after a batch of items have been successfully optimized (which
  1955. * indicates a sound index). The backup will be made to the directory
  1956. * specified in the application .properties file as the property
  1957. * 'Lucene-Backup-Directory=' or, if not there then in the Lucene properties
  1958. * file 'Server.properties' as the same property. If neither of these are
  1959. * defined, the server will attempt to use a sub-directory called
  1960. * {Lucene-Index-Directory}_backup, where {Lucene-Index-Directory} is the
  1961. * index path as already defined in the 'Server.properties' file.
  1962. * @param string $application Application name/domain name for searching in
  1963. * @param string $host Hostname or IP of Lucene server
  1964. * @param string $port Port of Lucene server
  1965. * @return boolean True if the operation was successful.
  1966. */
  1967. function lucene_backup($application="?", $host="", $port="") {
  1968. $backup = new lucene_utilitymsg("BACKUP", $application, $host, $port);
  1969. $backup->send(SOCK_FOREVER);
  1970. return $backup->response->valid;
  1971. } // lucene_backup
  1972. // ----------------------------------------------------------------------
  1973.  
  1974. /**
  1975. * Function to purge the Lucene index of all indexes to documents. Yes,
  1976. * I'll repeat that - it DELETES ALL DOCUMENTS FROM THE INDEX, permanently,
  1977. * finito, shazam, ba-boom, as in "Omigod did I *really* mean to do that!?".
  1978. * I guess I don't have to warn you to be careful with this, do I?
  1979. * @param string $application Application name/domain name for searching in
  1980. * @param string $host Hostname or IP of Lucene server
  1981. * @param string $port Port of Lucene server
  1982. * @return boolean True if the purging operation was successful.
  1983. */
  1984. function lucene_purge($application="?", $host="", $port="") {
  1985. $purgative = new lucene_purgemsg($application, $host, $port);
  1986. $purgative->send(SOCK_FOREVER);
  1987. return $purgative->response->valid;
  1988. } // lucene_purge
  1989. // ----------------------------------------------------------------------
  1990.  
  1991. ?>

Documentation generated by phpDocumentor 1.3.0RC3