Source for file lucene-defs.php

Documentation is available at lucene-defs.php

  1. <?php
  2. /* ******************************************************************** */
  3. /* CATALYST PHP Source Code */
  4. /* -------------------------------------------------------------------- */
  5. /* This program is free software; you can redistribute it and/or modify */
  6. /* it under the terms of the GNU General Public License as published by */
  7. /* the Free Software Foundation; either version 2 of the License, or */
  8. /* (at your option) any later version. */
  9. /* */
  10. /* This program is distributed in the hope that it will be useful, */
  11. /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
  12. /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
  13. /* GNU General Public License for more details. */
  14. /* */
  15. /* You should have received a copy of the GNU General Public License */
  16. /* along with this program; if not, write to: */
  17. /* The Free Software Foundation, Inc., 59 Temple Place, Suite 330, */
  18. /* Boston, MA 02111-1307 USA */
  19. /* -------------------------------------------------------------------- */
  20. /* */
  21. /* Filename: lucene-defs.php */
  22. /* Author: Paul Waite */
  23. /* Description: NB: This module is a variant of the original lucene */
  24. /* module which processed fields line-by-line. This module */
  25. /* implements the XML interface to Lucene. */
  26. /* */
  27. /* Definitions for interfacing to the LUCENE search */
  28. /* engine system. LUCENE is a system which is optimised */
  29. /* for indexing and searching in a generic way. It is */
  30. /* implemented as a server accessible via a port over TCP. */
  31. /* This module understands the protocol that this server */
  32. /* uses to implement indexing and search queries. */
  33. /* */
  34. /* THIS IS A DEPRECATED MODULE */
  35. /* (will be retained for back-compatibility) */
  36. /* */
  37. /* Instead include: */
  38. /* search-lucene-defs.php */
  39. /* - which will provide the underlying search engine */
  40. /* capabilities for the Catalyst Lucene Server Daemon. */
  41. /* */
  42. /* And for the high-level search/index functions include */
  43. /* the following, on an as-required basis: */
  44. /* search-query-defs.php */
  45. /* search-index-defs.php */
  46. /* search-fileindex-defs.php */
  47. /* - which provide all the high-level classes and methods */
  48. /* to search and index regardless of what the underlying */
  49. /* search engine actually is. */
  50. /* */
  51. /* CONVERSION TO NEW SCHEME: */
  52. /* To convert code from one to other scheme is relatively */
  53. /* pain-free. Where you are creating a 'lucene_search' */
  54. /* object, instead rename to 'searchengine_search'. Any */
  55. /* other methods used should just replace 'lucene_' with */
  56. /* 'searchengine_' where found. */
  57. /* */
  58. /* With indexing and unindexing, we have some new classes */
  59. /* 'searchengine_indexer' and 'searchengine_unindexer'. */
  60. /* Methods are as before except the final command to do */
  61. /* the job should now be execute() rather than send(). */
  62. /* */
  63. /* ******************************************************************** */
  64. /** @package search */
  // Every dependency is loaded with include_once so that a file which has
  // already been pulled in by another module is not loaded a second time.
  include_once("search-defs.php");
  /** Stopwatch microtimer */
  include_once("timer-defs.php");
  /** XML classes */
  include_once("xml-defs.php");
  70.  
  71. // ----------------------------------------------------------------------
  72. /** Do not wait on socket receive, return immediately */
  define("SOCK_NO_WAIT", 0);
  /** Wait on socket forever (well, 24hrs is that, more or less) */
  define("SOCK_FOREVER", 86400);
  /** Times to retry timed-out socket sends/receives */
  define("SOCK_RETRIES", 3);
  /** Used to indicate that a field should be indexed by Lucene */
  define("INDEXED", true);
  /** Used to indicate that a field should NOT be indexed by Lucene */
  define("NOT_INDEXED", false);
  /** Used to indicate that a field should be stored by Lucene */
  define("STORED", true);
  /** Used to indicate that a field should NOT be stored by Lucene */
  define("NOT_STORED", false);
  /** The name of the field Lucene should assume if none specified */
  define("DEFAULT_FIELD", "Text");
  /** Default type of field: 'Text', 'Date', 'Id' */
  define("DEFAULT_FIELDTYPE", "Text");
  /** Mode of index ID generation is by incrementing integer */
  define("ID_FROM_INC", 0);
  /** Mode of index ID generation is by filename stripped of path and extension */
  define("ID_FROM_NAME", 1);
  /** Mode of index ID generation is by full filename (incl. extension) */
  define("ID_FROM_FILENAME", 2);
  /** Mode of index ID generation is by full path to file */
  define("ID_FROM_PATH", 3);
  /** Indicates index fields come from meta tag extraction */
  define("META_TAG_FIELDS", true);
  100.  
  101. // ----------------------------------------------------------------------
  102. /**
  103. * The lucene connection class
  104. * This class inherits the functionality of the 'search' class since mostly
  105. * that is what we will be connecting to Lucene for. The Indexing and
  106. * Control descendants can just ignore this inherited basic searching
  107. * functionality.
  108. * This class knows how to connect to a Lucene server and send and
  109. * receive messages to/from it. Child classes which need to talk to this
  110. * server to do indexing or querying should inherit this class.
  111. * @package search
  112. */
  class lucene_connection extends search {
    // Public
    /** HOST running the Lucene query server */
    var $host = "";
    /** PORT that the server is listening on */
    var $port = "";
    /** Timeout for socket sends/receives, in seconds */
    var $timeoutsecs = 10;

    // Private
    /** Whether Lucene is enabled..
        @access private */
    var $enabled = true;
    /** The message waiting to be sent
        @access private */
    var $message = "";
    /** Raw response content we receive back from the Lucene server
        @access private */
    var $responsebuf = "";
    /** Socket file pointer, or false when no socket is open
        @access private */
    var $sockfp = false;
    /** True if we are connected to socket
        @access private */
    var $connected = false;
    /** An execution timer
        @access private */
    var $timer;
    // .....................................................................
    /**
    * Constructor - Lucene connection.
    * PHP 8 no longer treats a method named after its class as the
    * constructor, so this forwards to the legacy-named method below,
    * which descendant classes call explicitly.
    * @param string  $host        Hostname or IP of Lucene server
    * @param string  $port        Port of Lucene server
    * @param integer $timeoutsecs Seconds to timeout the connection
    */
    function __construct($host="", $port="", $timeoutsecs="") {
      $this->lucene_connection($host, $port, $timeoutsecs);
    } // __construct
    // .....................................................................
    /**
    * Legacy (class-named) constructor. Connects straight away if a host
    * is given, and creates the execution timer used by send/receive.
    * @param string  $host        Hostname or IP of Lucene server
    * @param string  $port        Port of Lucene server
    * @param integer $timeoutsecs Seconds to timeout the connection
    */
    function lucene_connection($host="", $port="", $timeoutsecs="") {
      debugbr("Lucene connection: using XML interface v1.0");
      if ($host != "") {
        $this->connect($host, $port, $timeoutsecs);
      }
      $this->timer = new microtimer();
    } // lucene_connection
    // .....................................................................
    /**
    * Decide whether a timeout argument was actually supplied. A plain
    * loose comparison against "" treats 0 (SOCK_NO_WAIT) as 'not given'
    * on PHP versions before 8, so the test is done strictly here.
    * @access private
    * @param mixed $timeoutsecs The timeout argument to examine
    * @return boolean True if a usable timeout value was passed
    */
    function has_timeout_value($timeoutsecs) {
      return ($timeoutsecs !== "" && $timeoutsecs !== null && $timeoutsecs !== false);
    } // has_timeout_value
    // .....................................................................
    /**
    * Connect to the Lucene server. Optionally over-ride the host, port
    * and timeout settings. Normally this method is only called
    * internally, in response to a request to send a message to the
    * Lucene server.
    * @access private
    * @param string  $host        Hostname or IP of Lucene server
    * @param string  $port        Port of Lucene server
    * @param integer $timeoutsecs Seconds to timeout the connection
    * @return boolean True if the socket was opened
    */
    function connect($host="", $port="", $timeoutsecs="") {
      // Override host and port if given..
      if ($host != "") $this->host = $host;
      if ($port != "") $this->port = $port;

      // Utilise the Axyl configuration settings, if available..
      if (class_exists("configuration")) {
        $config = new configuration("sys_control");
        // This controls whether we have Lucene capability or not..
        if ($config->field_exists("Lucene Site Indexing")) {
          $this->enabled = $config->value("Lucene Site Indexing");
        }
        // Only set host & port if they have not been given yet..
        if ($this->host == "") {
          $this->host = $config->value("Lucene Host");
          $this->port = $config->value("Lucene Port");
          debugbr("acquired Axyl config: host=$this->host, port=$this->port");
        }
      }
      // Try to open socket if we have a host..
      $this->connected = false;
      if ($this->enabled && $this->host != "") {
        $errno = 0;
        $errstr = "";
        // The port is held as a string; fsockopen() wants an integer..
        $this->sockfp = fsockopen($this->host, (int) $this->port, $errno, $errstr);
        if (!$this->sockfp) {
          $this->log_error("failed to connect to '$this->host:$this->port' (error $errno: $errstr)");
        }
        else {
          // Flag the connection first: set_timeout() does nothing
          // unless we are marked as connected..
          $this->connected = true;
          if ($this->has_timeout_value($timeoutsecs)) $this->timeoutsecs = $timeoutsecs;
          $this->set_timeout($this->timeoutsecs);
          debugbr("lucene_connection: connected to '$this->host:$this->port'");
        }
      }
      // Return result..
      return $this->connected;
    } // connect
    // .....................................................................
    /**
    * Disconnect from the Lucene server. Normally this is used only by
    * internal Lucene server methods. The connected flag is cleared so
    * that a later send() opens a fresh socket rather than writing to
    * the closed one.
    * @access private
    */
    function disconnect() {
      if ($this->connected) {
        if (is_resource($this->sockfp)) {
          fclose($this->sockfp);
        }
        $this->sockfp = false;
        $this->connected = false;
      }
    } // disconnect
    // .....................................................................
    /**
    * Set the socket timeout. Deals with the special case of setting
    * the socket to non-blocking mode (zero timeout, SOCK_NO_WAIT).
    * Has no effect unless a socket is currently connected.
    * @param integer $timeoutsecs Set the timeout in seconds
    */
    function set_timeout($timeoutsecs) {
      if ($this->connected && $this->has_timeout_value($timeoutsecs)) {
        $this->timeoutsecs = $timeoutsecs;
        $nowait = ($this->timeoutsecs == SOCK_NO_WAIT);
        if (!$nowait) {
          stream_set_timeout($this->sockfp, (int) $this->timeoutsecs);
        }
        // Zero timeout means non-blocking, anything else is blocking..
        stream_set_blocking($this->sockfp, !$nowait);
      }
    } // set_timeout
    // .....................................................................
    /**
    * Sends a message to the Lucene server, and receives the response. We
    * operate on the understanding that every time we send something to
    * Lucene we expect a response. Since this method already calls the
    * receive() method, there is no need to call it from your application.
    * The content to be sent is expected to be already in the class
    * string variable $message. The $response variable is reset to an
    * empty array, and the raw reply is left in $responsebuf by receive().
    * A failed receive is retried up to SOCK_RETRIES times.
    * @param integer $timeoutsecs Override for timeout in seconds
    * @return boolean True if the message was sent and a reply received
    */
    function send($timeoutsecs="") {
      $send_ok = true;
      $this->response = array();
      if (!$this->connected) {
        $this->connect();
      }
      if ($this->connected) {
        // Check for timeout over-ride..
        if ($this->has_timeout_value($timeoutsecs)) $this->timeoutsecs = $timeoutsecs;
        $this->set_timeout($this->timeoutsecs);
        // Send message..
        if ($this->message != "") {
          $this->timer->restart();
          $bytesput = fputs($this->sockfp, $this->message);
          $this->timer->stop();
          if (debugging()) {
            $buf = trim(substr(rawurldecode($this->message),0, 5000));
            debugbr("<pre>" . xmldump($buf) . "</pre>", DBG_DUMP);
            debugbr("lucene_connection: send transaction took " . $this->timer->formatted_millisecs() . "mS");
          }
          // fputs() signals failure with false, never with -1..
          if ($bytesput !== false) {
            debugbr("lucene_connection: send ok ($bytesput bytes)");
            for ($i=0; $i < SOCK_RETRIES; $i++) {
              $send_ok = $this->receive();
              if ($send_ok) break;
              debugbr("lucene_connection: receive retry #" . ($i + 1));
            }
          }
          else {
            $this->log_error("write to server failed");
            $send_ok = false;
          }
        }
        else {
          $this->log_error("trying to send null content");
          $send_ok = false;
        }
      }
      else {
        $this->log_error("send with no open socket");
        $send_ok = false;
      }
      // Return status..
      return $send_ok;
    } // send
    // .....................................................................
    /**
    * Receive a message from the Lucene server into $responsebuf. Reads
    * until the server closes the connection. A read which times out is
    * reported as a failure so that the caller can retry. When the socket
    * is non-blocking (SOCK_NO_WAIT) the method returns as soon as there
    * is nothing more to read, with or without a message. This is a
    * low-level routine which deals with receiving the message over TCP
    * sockets.
    * @return boolean True if the message was received loud and clear
    * @access private
    */
    function receive() {
      $received_ok = true;
      if ($this->connected) {
        $this->timer->restart();
        $this->responsebuf = "";
        while (!feof($this->sockfp)) {
          $buf = fread($this->sockfp, 10000);
          if ($buf === false) {
            $this->log_error("no response from server");
            $received_ok = false;
            break;
          }
          $this->responsebuf .= $buf;
          // A timed-out read hands back an empty string rather than
          // false, so the stream meta-data has to be consulted..
          $meta = stream_get_meta_data($this->sockfp);
          if (!empty($meta["timed_out"])) {
            $this->log_error("timed out waiting for server response");
            $received_ok = false;
            break;
          }
          // Non-blocking socket with nothing waiting: do not spin..
          if ($buf === "" && empty($meta["blocked"])) {
            break;
          }
        }
        $this->timer->stop();
        if (debugging()) {
          debugbr("<pre>" . xmldump($this->responsebuf) . "</pre>", DBG_DUMP);
          debugbr("lucene_connection: response from server took " . $this->timer->formatted_millisecs() . "mS");
        }
      }
      else {
        $this->log_error("receive with no open socket");
        $received_ok = false;
      }
      // Return status..
      return $received_ok;
    } // receive
    // .....................................................................
    /**
    * Log a message to the PHP error log and print info to debugger.
    * The message is prefixed with the class name and host:port, and
    * with APP_NAME too if that constant is defined.
    * @param string $err The error message to log
    * @access private
    */
    function log_error($err) {
      $prefix = (defined("APP_NAME") ? APP_NAME . ": " : "");
      $err = "Lucene error: " . get_class($this) . ": $this->host:$this->port: $err";
      debugbr($err);
      error_log($prefix . $err, 0);
    } // log_error

  } // lucene_connection class
  338. // ----------------------------------------------------------------------
  339.  
  340. /** The lucene fieldset class. This holds the Lucene fields for a lucene
  341. * message. These fields comprise the list of tags which make up
  342. * a query message or an index message.
  343. * @access private
  344. * @package search
  345. */
  class lucene_fieldset {
    /** Fields stored as an array of XML <Field> tags, keyed by field name */
    var $xmltags = array();
    // .....................................................................
    /**
    * Constructor. Nothing to set up. Declared so that the legacy
    * class-named method below is not taken to be the constructor.
    */
    function __construct() { }
    // .....................................................................
    /** Legacy (class-named) constructor, kept for explicit callers. */
    function lucene_fieldset() { }
    // .....................................................................
    /**
    * Return a copy of the named field object from fieldset by name.
    * NOTES: This function will return a new field if it does not already
    * exist. In either case changes made to the returned field are not
    * stored until you use the put_field() method to do so. Always
    * returns a field object.
    * @param string $fieldname The name of the field to get
    * @return object An xmltag object for the field
    */
    function get_field($fieldname) {
      if (isset($this->xmltags[$fieldname])) {
        $stored = $this->xmltags[$fieldname];
        // Objects are passed about by handle, so an explicit clone is
        // needed to hand back the copy which is promised above..
        return (is_object($stored) ? clone $stored : $stored);
      }
      $field = new xmltag("Field");
      $field->setattribute("name", $fieldname);
      return $field;
    } // get_field
    // .....................................................................
    /**
    * Puts the named field into fieldset, indexed by fieldname. Any
    * field already stored under that name is replaced.
    * @param string $fieldname Unique name of the field in the set
    * @param object $field The field object to store
    */
    function put_field($fieldname, $field) {
      $this->xmltags[$fieldname] = $field;
    } // put_field
    // .....................................................................
    /** Define a field in the fieldset. Set the definition for a field
    * in this fieldset. If the field does not exist it is created and
    * its definition set. If it exists the definition is updated.
    * @param string $fieldname Name of the field
    * @param string $type Type of this field eg. "Date"
    * @param boolean $stored Whether field value should be stored by Lucene
    * @param boolean $indexed Whether field value should be indexed by Lucene
    */
    function define_field($fieldname, $type, $stored=STORED, $indexed=INDEXED) {
      $field = $this->get_field($fieldname);
      $field->setattribute("type", $type);
      $field->setattribute("stored", ($stored ? "true" : "false"));
      $field->setattribute("indexed", ($indexed ? "true" : "false"));
      $this->put_field($fieldname, $field);
    } // define_field
    // .....................................................................
    /** Add a field to the fieldset, or set the value of an existing one.
    * @param string $fieldname Name of the field
    * @param string $fieldvalue Value to associate with this field
    */
    function add_field($fieldname, $fieldvalue="") {
      $field = $this->get_field($fieldname);
      $field->value = $fieldvalue;
      $this->put_field($fieldname, $field);
    } // add_field
    // .....................................................................
    /** Clear all fields from the fieldset */
    function clear() {
      $this->xmltags = array();
    } // clear
    // .....................................................................
    /**
    * Render the fieldset by concatenating the rendering of each field,
    * in the order the fields were first stored.
    * @return string The rendered fields, or nullstring if there are none
    */
    function render() {
      $s = "";
      foreach ($this->xmltags as $field) {
        $s .= $field->render();
      }
      return $s;
    } // render

  } // lucene_fieldset class
  424. // ----------------------------------------------------------------------
  425.  
  426. /**
  427. * The lucene msg class. This is a raw class which holds the basic
  428. * message fields and data and knows how to build them into a full
  429. * message for sending to the lucene server.
  430. * @package search
  431. */
  class lucene_msg extends lucene_connection {
    // Public
    /** Type/name of this message */
    var $type = "";

    // Private
    /** Array containing XML tags
        @access private */
    var $xmltags = array();
    /** Object containing lucene fields
        @access private */
    var $fieldset;
    /** True if message has been built
        @access private */
    var $built = false;
    /** Error message if any error occurred
        @access private */
    var $error_msg = "";
    // .....................................................................
    /**
    * Constructor. PHP 8 no longer treats a method named after its class
    * as the constructor, so this forwards to the legacy-named method
    * below, which descendant classes call explicitly.
    * @param string $type Type of message this is, eg; QUERY, INDEX..
    * @param string $application The application name. Sets default Lucene config.
    * @param string $host Hostname or IP of Lucene server
    * @param string $port Port of Lucene server
    */
    function __construct($type="", $application="?", $host="", $port="") {
      $this->lucene_msg($type, $application, $host, $port);
    } // __construct
    // .....................................................................
    /** Legacy (class-named) constructor
    * Notes: The application is either specified in the formal parameters or it
    * can be determined for an Axyl application by using the APP_PREFIX which
    * is unique to the application. This is the recommended option. Other
    * developers have, however, also used the configvalue 'Lucene Application'
    * for some reason, so this is still supported here. If none of these
    * methods results in a valid identifier, 'default' is used.
    * @param string $type Type of message this is, eg; QUERY, INDEX..
    * @param string $application The application name. Sets default Lucene config.
    * @param string $host Hostname or IP of Lucene server
    * @param string $port Port of Lucene server
    */
    function lucene_msg($type="", $application="?", $host="", $port="") {
      $this->lucene_connection($host, $port);
      $this->type = $type;
      $this->fieldset = new lucene_fieldset();
      // We must have an application..
      if ($application == "?") {
        if (class_exists("configuration")) {
          $config = new configuration("sys_control");
          $application = $config->value("Lucene Application");
        }
        // Axyl configuration value may not be defined and
        // the APP_PREFIX will be used in this case..
        if ($application == "" || $application == "?") {
          if (defined("APP_PREFIX")) {
            $application = APP_PREFIX;
          }
          else {
            // The default case for standalone apps..
            $application = "default";
          }
        }
      }
      // Set the application..
      $this->set_application($application);
    } // lucene_msg
    // .....................................................................
    /**
    * Add a new XML tag object to this Lucene message. Marks the message
    * as needing to be re-built.
    * @param object $tag The xmltag object to add to our lucene msg
    */
    function add_xmltag($tag) {
      $this->xmltags[] = $tag;
      $this->built = false;
    } // add_xmltag
    // .....................................................................
    /**
    * Specify the application. The application is the name of a configuration
    * set which has been specified either by a control message, or by using
    * configuration files on the server. A given configuration set identified
    * by an application name can have specific fields already defined, such
    * as Sort: or Domain: etc.
    * Notes: The 'Application' header can only appear once in the message.
    * @param string $application The application name to set.
    */
    function set_application($application) {
      $this->add_xmltag( new xmltag("Application", $application) );
    } // set_application
    // .....................................................................
    /**
    * Specify a domain. A domain is an identifier which groups indexed
    * objects internally to Lucene. This allows searches on multiple
    * archives of documents in a single Lucene installation.
    * Notes: There may be zero or more domain headers in the message. If it
    * does not appear, then any domain header defined for the application
    * will be applied on its own. Otherwise any definitions added by this
    * method are OR'd with any specified in the application config.
    * NB: If no domains are specified anywhere, any searching will be done
    * across all domains (which would probably yield very confusing return
    * data!).
    * @param string $domain The domain to set.
    */
    function set_domain($domain) {
      $this->add_xmltag( new xmltag("Domain", $domain) );
    } // set_domain
    // .....................................................................
    /** Add a field to the fieldset. Marks the message as needing to
    * be re-built.
    * @param string $fieldname Name of the field
    * @param string $fieldvalue Value to associate with this field
    */
    function add_field($fieldname, $fieldvalue="") {
      $this->fieldset->add_field($fieldname, $fieldvalue);
      $this->built = false;
    } // add_field
    // .....................................................................
    /** Clear all fields and the built message text, leaving the type
    * definition and the XML tags alone. */
    function clear() {
      $this->fieldset->clear();
      $this->message = "";
      $this->built = false;
    } // clear
    // .....................................................................
    /**
    * Builds the message according to the message type. The XML tags and
    * then the fields (wrapped in a 'Fields' tag) are rendered inside a
    * tag named for the message type. Nothing is built if no type has
    * been set. This method may be over-ridden in children inheriting
    * this class.
    * @return boolean True if the message is built and ready to send
    * @access private
    */
    function build() {
      if (!$this->built && $this->type != "") {
        $xml = new xmltag($this->type);
        // XML TAGS
        foreach ($this->xmltags as $tag) {
          $xml->childtag($tag);
        }
        // FIELDS
        if (count($this->fieldset->xmltags) > 0) {
          $fields = new xmltag("Fields");
          foreach ($this->fieldset->xmltags as $field) {
            $fields->childtag($field);
          }
          $xml->childtag($fields);
        }
        $this->message = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $xml->render();
        $this->built = true;
      }
      return $this->built;
    } // build
    // .....................................................................
    /**
    * Builds and sends the current message to Lucene, then closes the
    * socket. Nothing is sent if the message could not be built.
    * @param integer $timeoutsecs Override for timeout in seconds
    * @return boolean True if the message was built and the send succeeded
    */
    function send($timeoutsecs="") {
      $sent_ok = false;
      if ($this->build()) {
        // Low-level socket send-receive transaction..
        $sent_ok = parent::send($timeoutsecs);
        // Once a msg is sent, socket can be closed..
        $this->disconnect();
        // Make certain the next send opens a fresh socket..
        $this->connected = false;
      }
      return $sent_ok;
    } // send

  } // lucene_msg class
  589. // ----------------------------------------------------------------------
  590.  
  591. /**
  592. * The lucene message class. This class extends its parent class
  593. * lucene_msg and adds some higher level methods for adding groups of
  594. * fields to the message.
  595. * @package search
  596. */
  597. class lucene_message extends lucene_msg {
  598. /** Response object which will parse XML content
  599. @access private */
  600. var $response;
  601. // .....................................................................
  602. /** Constructor
  603. * This is a more complex class which builds on the basic lucene_msg
  604. * class to provide some higher level methods for adding fields in
  605. * specific ways to support CONTROL, QUERY and INDEX message types.
  606. * @param string $type Type of message this is, eg; QUERY, INDEX..
  607. * @param string $application The application name. Sets default Lucene config.
  608. * @param string $host Hostname or IP of Lucene server
  609. * @param string $port Port of Lucene server
  610. */
  function __construct($type="", $application="?", $host="", $port="") {
    // PHP 8 no longer treats a method named after its class as the
    // constructor, so forward to the legacy-named method below..
    $this->lucene_message($type, $application, $host, $port);
  } // __construct
  // .....................................................................
  /** Legacy (class-named) constructor. Takes the same parameters as
  * the constructor above and hands them on to the lucene_msg set-up. */
  function lucene_message($type="", $application="?", $host="", $port="") {
    $this->lucene_msg($type, $application, $host, $port);
  } // lucene_message
  614. // .....................................................................
  615. /**
  616. * Strip field type specifiers out of field strings. A field string with
  617. * a type specifier in it is of the form: 'Foo:Date', where the field
  618. * name is 'Foo' and the field type is 'Date'. Possible field types are
  619. * 'Id', 'Text' (the default), and 'Date'.
  620. * Note that sort field specification is a special case, where the syntax
  621. * can be 'Foo:Date:Desc' or 'Foo:Desc' indicating the sort on the given
  622. * field should be done in descending order.
  623. * At present you would only use this facility with a 'Date' field, and
  624. * everything else would then default to 'Text'. [The 'Id' type being a
  625. * special one]
  626. * We return the field stripped of any type, and if a type was present
  627. * we issue the define_field() directive to define it. A field so-defined
  628. * will always be both stored by Lucene and indexed.
  629. * @param string $field Field in 'Foo:Date' format, or just 'Foo' for default type
  630. * @return string The fieldname stripped of any type specifier
  631. * @access private
  632. */
  function strip_field_type($field) {
    // Split into the field name and up to two optional specifiers. The
    // list is padded so that absent specifiers come out as nullstrings
    // rather than nulls, which trim() should not be given..
    $bits = array_pad(explode(":", (string) $field), 3, "");
    // The name is trimmed whether or not a specifier follows it..
    $fieldname = trim($bits[0]);
    $f1 = trim($bits[1]);
    $f2 = trim($bits[2]);
    $retfieldname = $fieldname;
    // Check for a sort field with DESC specifier..
    if ($f1 == "Desc" || $f2 == "Desc") {
      $retfieldname .= ":Desc";
    }
    // Check for valid field type specifier..
    if ($f1 == "Date" || $f1 == "Text" || $f1 == "Id") {
      // Define field by name..
      $this->define_field($fieldname, $f1);
    }
    // Return fieldname plus any sort spec..
    return $retfieldname;
  } // strip_field_type
  656. // .....................................................................
  657. /**
  658. * Define a field. We supply the name of the field, its type (Text, Date
  659. * or Id), and whether it should be stored by Lucene for later retrieval
  660. * in queries. For example you would not store the raw document/content as
  661. * this is usually stored elsewhere.
  662. * We also cater for fields which might not need to be indexed. These would
  663. * be fields of data you just want to return with the document, if found in
  664. * a query, but not search on. An example might be a field containing the
  665. * path to the physical document on disk. For these fields you would then
  666. * specify NOT_INDEXED for the $indexed parameter. These fields MUST be
  667. * stored, so we make the rule: if the field is NOT_INDEXED then it must
  668. * be STORED (this will be forced).
  669. * In the normal course of events, fields will be defined to be both stored
  670. * and indexed. The exception is the special "Text" field associated with
  671. * an item "Body", which is indexed, but never stored.
  672. * This method adds the field settings directly via the add_field() method.
  673. * @see add_field()
  674. * @param string $fieldname Name of the field to index
  675. * @param string $type Type of field data: Text, Date or Id.
  676. * @param boolean $stored If true then Lucene will store the content itself
  677. * @param boolean $indexed If true then Lucene will index the field content
  678. */
  function define_field($fieldname, $type, $stored=STORED, $indexed=INDEXED) {
    // A field which is not indexed must be stored, so the stored
    // setting is forced in that case..
    $force_stored = ($indexed == NOT_INDEXED);
    $this->fieldset->define_field(
      $fieldname,
      $type,
      ($force_stored ? STORED : $stored),
      $indexed
    );
  } // define_field
  684. // .....................................................................
  685. /**
  686. * Specify the fields you want returned from Lucene.
  687. * Fields should be in a comma-separated list of field names. Each field
  688. * name can have the field type included in the form 'Foo:Date', where
  689. * 'Date' is the type in this instance. In fact, since 'Text' is the
  690. * default field type, 'Date' is probably the only one you need to use
  691. * as the current implementation stands.
  692. * This method adds the field setting directly via the add_field() method.
  693. * @see add_field
  694. * @param mixed $fields Comma-delimited fieldname list, or array of fields
  695. */
  function set_returnfields($fields) {
    // Take either a ready-made array or a comma-delimited string..
    $flds = (is_array($fields) ? $fields : explode(",", $fields));
    // Remove any type specifier from each name in turn..
    $names = array();
    foreach ($flds as $spec) {
      $names[] = $this->strip_field_type($spec);
    }
    // The names go into the Return tag as a space-delimited list..
    $this->add_xmltag( new xmltag("Return", implode(" ", $names)) );
  } // set_returnfields
  710. // .....................................................................
  711. /**
  712. * Specify query limit field. This sets the maximum number of results
  713. * that Lucene should return.
  714. * @param integer $limit Maximum number of results (hits) to return
  715. */
  function set_limit($limit) {
    // The limit is carried in a 'Limit' tag of the message..
    $limit_tag = new xmltag("Limit", $limit);
    $this->add_xmltag($limit_tag);
  } // set_limit
  719. // .....................................................................
  /**
   * Specify the query offset. Attaches a "First" tag holding the given
   * value to this message. For example, if this was set to 3 and Lucene
   * found 20 hits, then results would be sent back from the 3rd hit
   * onwards.
   * @param integer $first Offset in result set to start from
   */
  function set_first($first) {
    $first_tag = new xmltag("First", $first);
    $this->add_xmltag($first_tag);
  } // set_first
  730. // .....................................................................
  /**
   * Specify the fields you want query results to be ordered by.
   * The fields may be given either as an array, or as a single string of
   * comma-separated field names. A name may carry its type in the form
   * 'Foo:Date', where 'Date' is the type; 'Text' is the default type.
   * Sort fields are a special case: the syntax can also be 'Foo:Date:Desc'
   * or 'Foo:Desc', indicating that the sort on that field should be done
   * in descending order.
   * Every entry is passed through strip_field_type(), and the results are
   * joined with single spaces into one "Sort" tag which is attached to
   * this message via add_xmltag().
   * @param mixed $fields Comma-delimited fieldname list, or array of fields
   */
  function set_sortorder($fields) {
    // Normalise the argument to an array of sort specifications..
    $specs = is_array($fields) ? $fields : explode(",", $fields);

    // Process each specification in order..
    $stripped = array();
    foreach ($specs as $spec) {
      array_push($stripped, $this->strip_field_type($spec));
    }

    // Create the field as a space-delimited list..
    $this->add_xmltag( new xmltag("Sort", implode(" ", $stripped)) );
  } // set_sortorder
  758. // .....................................................................
  /**
   * Specify a range on a field for querying. We name the field which is
   * used to select articles within the given limits, and the limits
   * themselves. Either limit may be passed as nullstring, which indicates
   * no limit on that side. Any dates must be passed as standard Unix
   * timestamps (seconds since 1970).
   * Builds a "Range" tag with a "field" attribute (the field name after
   * strip_field_type()) and optional "From" / "To" child tags, and attaches
   * it to this message via add_xmltag(). Nothing is added when the field
   * name is empty.
   * Notes: This method can be called multiple times to define additional
   * ranges for different field names.
   * A limit is tested with loose comparisons, so any value which compares
   * equal to false (nullstring, null, false, and also 0 and "0") is
   * treated as 'no limit' on that side.
   * @param string $range_from Value of lowerbound range
   * @param string $range_to Value of upperbound range
   * @param string $range_fieldname Name of field to use in range query.
   */
  function set_range($range_from="", $range_to="", $range_fieldname="") {
    // No field name, no range..
    if ($range_fieldname == "") {
      return;
    }
    $range = new xmltag("Range");
    $range->setattribute("field", $this->strip_field_type($range_fieldname));

    // Lower bound first, then upper bound; unset sides are left out..
    $limits = array("From" => $range_from, "To" => $range_to);
    foreach ($limits as $tagname => $limit) {
      if ($limit != "" && $limit != false) {
        $range->childtag( new xmltag($tagname, $limit) );
      }
    }
    $this->add_xmltag( $range );
  } // set_range
  786. // .....................................................................
  /**
   * Supply a stopword list to lucene.
   * An array of stopwords is joined with single spaces; anything else is
   * used as given. The list is attached to this message as a "Stop-List"
   * tag via add_xmltag().
   * @param mixed $stopwords Space-delimited list, or array of stopwords
   */
  function set_stopwords($stopwords) {
    $stoplist = is_array($stopwords) ? implode(" ", $stopwords) : $stopwords;
    $this->add_xmltag( new xmltag("Stop-List", $stoplist) );
  } // set_stopwords
  802.  
  803. } // lucene_message class
  804. // ----------------------------------------------------------------------
  805.  
  /**
   * Container for one hit of a generic search query: the rank given to
   * the hit, plus the field values returned for it. This is for internal
   * use only.
   * @package search
   * @access private
   */
  class queryresult {
    /** Rank of this result, as supplied to the constructor */
    var $rank = "";
    /** Field values for this result, keyed on field name */
    var $fields = array();

    /**
     * Make a new result.
     * @param string $rank Rank of this result
     */
    function queryresult($rank="") {
      $this->rank = $rank;
    } // queryresult

    /**
     * Store a field value against this result. Storing the same field
     * name again replaces the earlier value.
     * @param string $fieldname Name of the field
     * @param string $fieldvalue Value of the field
     */
    function addfield($fieldname, $fieldvalue) {
      $this->fields[$fieldname] = $fieldvalue;
    } // addfield

  } // queryresult class
  823. // ----------------------------------------------------------------------
  824.  
  /**
   * Class comprising the functionality of a Lucene response parser. It
   * extends xmlparser, supplying the tag_open(), cdata() and tag_close()
   * handlers, and records the validity, status, error text and serial of
   * a response. This is for internal use only.
   * @package search
   * @access private
   */
  class response_parser extends xmlparser {
    /** Current/last tag opened */
    var $tag = "";
    /** Attributes array for current/last tag */
    var $attr = array();
    /** Serial transaction ID */
    var $serial = "";
    /** Status message */
    var $status_message = "";
    /** Error message, nullstring if the response reported no error */
    var $error_message = "";
    /** True if response was valid, ie. no errors */
    var $valid = true;
    /** All cdata content for the response, keyed on tag name */
    var $tag_data = array();
    /** True once character data has been seen since the last tag
        open/close, ie. the next piece continues the same text
        @access private */
    var $cdata_continues = false;
    // .....................................................................
    /** Construct a new parser. */
    function response_parser() {
      $this->xmlparser();
    } // response_parser
    // .....................................................................
    /**
     * Method invoked when a tag is opened. Remembers the tag name, merges
     * its attributes into the current attribute set, and flags the
     * response as invalid when the tag is "Error".
     * @param resource $parser The XML parser
     * @param string $tag Name of the tag being opened
     * @param array $attributes Attributes of the tag
     */
    function tag_open($parser, $tag, $attributes) {
      $this->tag = $tag;
      // Any text which follows belongs to this new tag..
      $this->cdata_continues = false;
      if (is_array($attributes) && count($attributes) > 0) {
        foreach ($attributes as $key => $value ) {
          $this->attr[$key] = $value;
        }
      }
      switch ($tag) {
        case "Error":
          $this->valid = false;
          break;
      } // switch
    } // tag_open
    // .....................................................................
    /**
     * Method invoked when character data is available. This is essentially
     * a field of data, with the name of the field being the tag-name, and
     * the value being the cdata itself. Here we cherry-pick a few 'special'
     * values and assign to class vars for easier access. The character
     * data is all stashed in the 'tag_data' array under the name of the tag
     * as well, so no fields are lost.
     * The XML extension may deliver the text of a single element in
     * several pieces (for instance around entity references), so a piece
     * which directly follows another one is appended rather than
     * overwriting what was already received.
     * @param resource $parser The XML parser
     * @param string $cdata The character data received
     */
    function cdata($parser, $cdata) {
      $tag = $this->tag;
      $append = $this->cdata_continues;
      $this->cdata_continues = true;

      switch ($tag) {
        case "Error":
          $this->error_message = ($append ? $this->error_message : "") . $cdata;
          debugbr("lucene error: $this->error_message");
          break;
        case "Status":
          $this->status_message = ($append ? $this->status_message : "") . $cdata;
          debugbr("lucene status: $this->status_message");
          break;
        case "Serial":
          $this->serial = ($append ? $this->serial : "") . $cdata;
          break;
      } // switch

      // Record all tag data. Note that attributes, if any, are
      // not recorded at this point..
      if ($append && isset($this->tag_data[$tag])) {
        $this->tag_data[$tag] .= $cdata;
      }
      else {
        $this->tag_data[$tag] = $cdata;
      }
    } // cdata
    // .....................................................................
    /**
     * Method invoked when a tag is closed. Forgets the current tag and
     * its attributes.
     * @param resource $parser The XML parser
     * @param string $tag Name of the tag being closed
     */
    function tag_close($parser, $tag) {
      $this->tag = "";
      $this->attr = array();
      // Text after a closing tag starts a fresh run..
      $this->cdata_continues = false;
    } // tag_close
    // .....................................................................
    /**
     * Parse the given XML response. The work is done by xmlparser::parse().
     * Afterwards the response is flagged invalid if the parent class
     * reports the XML as not valid, and any error message which was
     * received is written to the system log via log_sys().
     * @param string $xml The XML response to parse
     */
    function parse($xml) {
      xmlparser::parse($xml);
      if (!$this->valid_xml) {
        $this->valid = false;
      }
      if ($this->error_message != "") {
        log_sys($this->error_message);
      }
    } // parse

  } // response_parser class
  918. // ----------------------------------------------------------------------
  919.  
  /**
   * Class comprising the functionality of an XML parser for query
   * responses. On top of what response_parser records, it collects the
   * hit count and one queryresult object per "Result" tag, each holding
   * the values of the "Field" tags which follow it. This is for internal
   * use only.
   * @package search
   * @access private
   */
  class queryresponse_parser extends response_parser {
    /** Results returned count */
    var $count = 0;
    /** Array of queryresult objects, keyed on the 'counter' attribute of
        each Result tag. Left unset until the first Result arrives. */
    var $results;
    /** True while we are inside the "Results" tag */
    var $results_stream = false;
    /** Name of the Field whose text is currently being received, or
        false when no field text is in progress
        @access private */
    var $current_field = false;
    // .....................................................................
    /** Construct a new parser. */
    function queryresponse_parser() {
      $this->response_parser();
    } // queryresponse_parser
    // .....................................................................
    /**
     * Method invoked when a tag is opened. "Results" switches result
     * collection on, and "Result" starts a new result from its 'counter'
     * and 'rank' attributes.
     * @param resource $parser The XML parser
     * @param string $tag Name of the tag being opened
     * @param array $attributes Attributes of the tag
     */
    function tag_open($parser, $tag, $attributes) {
      response_parser::tag_open($parser, $tag, $attributes);
      // A new tag always ends any field text which was in progress..
      $this->current_field = false;
      switch ($tag) {
        case "Results":
          $this->results_stream = true;
          break;
        case "Result":
          $this->addresult(
            $this->attr["counter"],
            $this->attr["rank"]
            );
          $this->attr = array();
          break;
      } // switch
    } // tag_open
    // .....................................................................
    /**
     * Method invoked when character data is available. "Count" text is
     * kept as the hit count. "Field" text received inside "Results" is
     * stored against the most recently added result, under the field's
     * 'name' attribute. Fields without attributes are ignored.
     * The XML extension may deliver the text of one Field in several
     * pieces (for instance around entity references). The attributes are
     * cleared after the first piece, so the field name is remembered and
     * later pieces are appended to the value rather than being lost.
     * @param resource $parser The XML parser
     * @param string $cdata The character data received
     */
    function cdata($parser, $cdata) {
      response_parser::cdata($parser, $cdata);
      switch ($this->tag) {
        case "Count":
          $this->count = $cdata;
          break;
        case "Field":
          if ($this->results_stream) {
            if (count($this->attr) > 0) {
              // First piece of text for this field..
              $this->current_field = isset($this->attr["name"]) ? $this->attr["name"] : "";
              $this->store_field($this->current_field, $cdata, false);
            }
            elseif ($this->current_field !== false) {
              // A further piece of the same field's text..
              $this->store_field($this->current_field, $cdata, true);
            }
            $this->attr = array();
          }
          break;
      } // switch
    } // cdata
    // .....................................................................
    /**
     * Method invoked when a tag is closed. Closing "Results" switches
     * result collection off again.
     * @param resource $parser The XML parser
     * @param string $tag Name of the tag being closed
     */
    function tag_close($parser, $tag) {
      response_parser::tag_close($parser, $tag);
      $this->current_field = false;
      switch ($tag) {
        case "Results":
          $this->results_stream = false;
          break;
      } // switch
    } // tag_close
    // .....................................................................
    /**
     * Add a new, empty result to the response.
     * @param string $id Key to store the result under
     * @param string $rank Rank of the result
     */
    function addresult($id, $rank) {
      $this->results[$id] = new queryresult($rank);
    } // addresult
    // .....................................................................
    /**
     * Store field text against the most recently added result. The result
     * is updated in place in the results array. Field text which arrives
     * before any result exists is ignored.
     * @param string $fieldname Name of the field
     * @param string $fieldvalue Text received for the field
     * @param boolean $append True to add to the existing value of the field
     * @access private
     */
    function store_field($fieldname, $fieldvalue, $append) {
      if (!is_array($this->results) || count($this->results) == 0) {
        return;
      }
      // Locate the last result which was added..
      end($this->results);
      $lastkey = key($this->results);
      if ($append && isset($this->results[$lastkey]->fields[$fieldname])) {
        $fieldvalue = $this->results[$lastkey]->fields[$fieldname] . $fieldvalue;
      }
      $this->results[$lastkey]->addfield($fieldname, $fieldvalue);
    } // store_field

  } // queryresponse_parser class
  998. // ----------------------------------------------------------------------
  999.  
  /**
   * The lucene query message class. This class extends lucene_message,
   * and adds query-specific methods for searching.
   * @package search
   */
  class lucene_querymsg extends lucene_message {
    /** Set to true if sort limit was exceeded in query */
    var $sort_limit_exceeded = false;
    /** Set to true if Lucene blew its memory trying to sort */
    var $sort_memory_exceeded = false;
    // .....................................................................
    /** Constructor
     * Make a new Lucene query message, of type "LuceneQueryRequest".
     * @param string $application Optional application specifier.
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function lucene_querymsg($application="?", $host="", $port="") {
      $this->lucene_message("LuceneQueryRequest", $application, $host, $port);
    } // lucene_querymsg
    // .....................................................................
    /**
     * Set the query for this message. Builds a "Query" tag containing the
     * query, with its 'default-field' attribute set to DEFAULT_FIELD, and
     * attaches it via add_xmltag().
     * NOTE(review): there should be only one query per message. Whether a
     * repeated call replaces the earlier query depends on add_xmltag(),
     * which is not part of this class - confirm before relying on it.
     * @param string $query The query to submit to Lucene.
     */
    function set_query($query) {
      $queryxml = new xmltag("Query", $query);
      $queryxml->setattribute("default-field", DEFAULT_FIELD);
      $this->add_xmltag($queryxml);
    } // set_query
    // .....................................................................
    /**
     * Send the message to Lucene, and then post-process the response for
     * query hits. Each result in a valid response becomes one hit: an
     * array holding the rank under "RANK" plus every returned field under
     * its own name. Hits are appended to the 'hit' array of this object.
     * For an invalid response the error text is checked, and the flags
     * 'sort_limit_exceeded' / 'sort_memory_exceeded' are set accordingly.
     * @param integer $timeoutsecs Override for timeout in seconds
     * @return boolean True if a valid response was received.
     */
    function send($timeoutsecs="") {
      // Initialise flags..
      $this->sort_limit_exceeded = false;
      $this->sort_memory_exceeded = false;

      // Msg-level send-receive transaction..
      lucene_message::send($timeoutsecs);

      // Process the response to our request..
      $this->response = new queryresponse_parser();
      $this->response->parse($this->responsebuf);

      // Unpack the response if no errors..
      if ($this->response->valid) {
        // Here we will unpack the returned search query hits
        // and store them locally for use by child classes.
        if (isset($this->response->results)) {
          foreach ($this->response->results as $result) {
            $hit = array();
            $hit["RANK"] = $result->rank;
            foreach ($result->fields as $fieldname => $fieldvalue) {
              $hit[$fieldname] = $fieldvalue;
            }
            $this->hit[] = $hit;
          }
        }
      }
      else {
        // A response can be invalid without carrying any error text (eg.
        // when the XML itself was bad), so only inspect it if present..
        $errmsg = "";
        if (isset($this->response->error_message)) {
          $errmsg = (string) $this->response->error_message;
        }
        // Check for sort limit/memory error conditions..
        if ($errmsg != "") {
          if (stristr($errmsg, "system sort limit") !== false) {
            $this->sort_limit_exceeded = true;
          }
          if (stristr($errmsg, "out of memory") !== false) {
            $this->sort_memory_exceeded = true;
          }
        }
      }
      // Return status of the query, as the other message classes do..
      return $this->response->valid;
    } // send

  } // lucene_querymsg class
  1084. // ----------------------------------------------------------------------
  1085.  
  /**
   * The lucene index message class. This class extends lucene_message,
   * and adds indexing-specific methods.
   * @package search
   */
  class lucene_indexmsg extends lucene_message {
    // Public
    /** Indication that the indexing was successful */
    var $indexed = false;

    // Private
    /** A unique handle to identify the index
        response from Lucene
        @access private */
    var $serialno = "";
    // .....................................................................
    /** Constructor
     * Make a new Lucene index message, of type "LuceneIndexRequest". A
     * unique serial number is generated and attached as a "Serial" tag,
     * and the default field is defined. When microsites mode is enabled
     * in the global $RESPONSE, a stored and indexed "site" field is also
     * defined, and set to the detected microsite (or APP_NAME if none).
     * @param string $application Optional application specifier
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function lucene_indexmsg($application="?", $host="", $port="") {
      global $RESPONSE;
      $this->lucene_message("LuceneIndexRequest", $application, $host, $port);
      $this->serialno = md5(uniqid(""));
      $this->add_xmltag( new xmltag("Serial", $this->serialno) );
      $this->define_field(DEFAULT_FIELD, DEFAULT_FIELDTYPE, NOT_STORED);
      // Partitioned indexing for microsites..
      if (isset($RESPONSE) && $RESPONSE->microsites_mode == MICROSITES_ENABLED) {
        $site = (isset($RESPONSE->microsite_detected)) ? $RESPONSE->microsite_detected : APP_NAME;
        $this->define_field("site", "text", STORED, INDEXED);
        $this->index_field("site", $site);
      }
    } // lucene_indexmsg
    // .....................................................................
    /**
     * Supply field content for indexing. This causes Lucene to take the given
     * fieldname and index the given value against it. NB: we silently ignore
     * the request for nullstring (and for null, which would be sent as an
     * empty value too), since these cause Lucene indexing to throw an
     * exception, and indexing will fail.
     * The field name can have the field type included in the form 'Foo:Date',
     * where 'Date' is the type in this instance. In fact, since 'Text' is the
     * default field type, 'Date' is probably the only one you need to use
     * as the current implementation stands.
     * @param string $fieldname Name of the field to index.
     * @param string $fieldvalue Content of the field to index
     */
    function index_field($fieldname, $fieldvalue) {
      if (isset($fieldvalue) && $fieldvalue !== "") {
        $fieldname = $this->strip_field_type($fieldname);
        $this->add_field($fieldname, $fieldvalue);
      }
    } // index_field
    // .....................................................................
    /**
     * Index the given content against the given ID. The ID is attached as
     * an "Id" tag, and the content is added as the value of the default
     * field via add_field(). Before that, newlines, carriage returns and
     * tabs in the content are turned into spaces, and runs of spaces are
     * collapsed to a single space. Nothing is done for nullstring or null
     * content.
     * @param string $id The ID to associate with the given indexed data.
     * @param string $content The binary/text content to be indexed.
     */
    function index_content($id, $content) {
      if (isset($content) && $content !== "") {
        $this->add_xmltag( new xmltag("Id", $id) );
        $content = preg_replace("/[\n\r\t]/", " ", $content);
        $content = preg_replace("/[ ]{2,}/", " ", $content);
        $this->add_field(DEFAULT_FIELD, $content);
      }
    } // index_content
    // .....................................................................
    /**
     * Send the message to Lucene, and then post-process the response for
     * indication of a successful index operation. We expect to receive
     * a response back from Lucene which has our serialno in it. This method
     * returns True if the indexing was successful, else False.
     * @param integer $timeoutsecs Override for timeout in seconds
     * @return boolean True if indexing was successful.
     */
    function send($timeoutsecs="") {
      // Forget the outcome of any earlier send, so that a failure this
      // time round cannot be reported as the previous success..
      $this->indexed = false;

      // Msg-level send-receive transaction..
      lucene_message::send($timeoutsecs);

      // Process the response to our request..
      $this->response = new response_parser();
      $this->response->parse($this->responsebuf);

      // Unpack the response if no errors..
      if ($this->response->valid) {
        $this->indexed = ($this->response->serial == $this->serialno);
      }
      // Return status of indexing operation..
      return $this->indexed;
    } // send

  } // lucene_indexmsg class
  1186. // ----------------------------------------------------------------------
  1187.  
  /**
   * The lucene unindex message class. Use this message to remove a single
   * item from the Lucene index. You must know the unique ID that
   * identifies the document.
   * @package search
   */
  class lucene_unindexmsg extends lucene_message {
    // .....................................................................
    /** Constructor
     * Make a new Lucene unindex message, of type "LuceneUnIndexRequest".
     * Call the 'unindex' method with the ID of the item to delete before
     * sending it.
     * @param string $application Optional application specifier
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function lucene_unindexmsg($application="?", $host="", $port="") {
      $this->lucene_message("LuceneUnIndexRequest", $application, $host, $port);
    } // lucene_unindexmsg
    // .....................................................................
    /**
     * Nominate the document to unindex, as identified by its unique ID.
     * The ID is attached to this message as an "Id" tag.
     * @param string $id The ID to allow Lucene to identify the item to unindex
     */
    function unindex($id) {
      $id_tag = new xmltag("Id", $id);
      $this->add_xmltag($id_tag);
    } // unindex

  } // lucene_unindexmsg class
  1218. // ----------------------------------------------------------------------
  1219.  
  /**
   * The lucene purge message class. Use this message to remove all
   * items from the Lucene index. Take care!
   * @package search
   */
  class lucene_purgemsg extends lucene_message {
    // .....................................................................
    /** Constructor
     * Make a new Lucene purge message. This is a message of type
     * "LuceneUnIndexRequest" which carries an empty "Purge" tag.
     * @param string $application Optional application specifier
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function lucene_purgemsg($application="?", $host="", $port="") {
      $this->lucene_message("LuceneUnIndexRequest", $application, $host, $port);
      $purge_tag = new xmltag("Purge");
      $this->add_xmltag($purge_tag);
    } // lucene_purgemsg

  } // lucene_purgemsg class
  1240. // ----------------------------------------------------------------------
  1241.  
  /**
   * The lucene utility message class. Used for special Lucene operations.
   * @package search
   */
  class lucene_utilitymsg extends lucene_message {
    /** Constructor
     * Make a new message of type "LuceneUtilityRequest". If a command is
     * given, it is attached to the message as a "Utility" tag.
     * @param string $utilitycmd Command for this utility message.
     * @param string $application Optional application specifier
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function lucene_utilitymsg($utilitycmd="", $application="?", $host="", $port="") {
      $this->lucene_message("LuceneUtilityRequest", $application, $host, $port);
      // No command, then there is nothing more to add..
      if ($utilitycmd == "") {
        return;
      }
      $this->add_xmltag( new xmltag("Utility", $utilitycmd) );
    } // lucene_utilitymsg
    // .....................................................................
    /**
     * Send the message to Lucene, and then parse the response with a
     * response_parser. The operation counts as successful when the
     * parser reports the response as valid.
     * @param integer $timeoutsecs Override for timeout in seconds
     * @return boolean True if operation was successful.
     */
    function send($timeoutsecs="") {
      // Msg-level send-receive transaction..
      lucene_message::send($timeoutsecs);

      // Process the response to our request..
      $this->response = new response_parser();
      $this->response->parse($this->responsebuf);

      // Return status of the utility operation..
      $success = $this->response->valid;
      return $success;
    } // send

  } // lucene_utilitymsg class
  1282. // ----------------------------------------------------------------------
  1283.  
  /**
   * The lucene search class
   * This class extends lucene_querymsg, and also uses the functionality
   * of the generic 'search' class. Use the methods in this class as the
   * mainstay in implementing queries of content from Lucene. The match
   * methods register search terms, and the query itself is only built
   * and sent when you call execute().
   * @package search
   */
  class lucene_search extends lucene_querymsg {
    // .....................................................................
    /**
     * Constructor
     * Create a new lucene search. When microsites mode is enabled in the
     * global $RESPONSE, the search is restricted to the detected microsite
     * (or APP_NAME if none) with a must-match term on the "site" field.
     * @param string $application Application name/domain name for searching in
     * @param string $host Hostname or IP of Lucene server
     * @param string $port Port of Lucene server
     */
    function lucene_search($application="?", $host="", $port="") {
      global $RESPONSE;
      $this->search();
      $this->lucene_querymsg($application, $host, $port);
      $this->initialise();
      // Partitioned indexing for microsites..
      if (isset($RESPONSE) && $RESPONSE->microsites_mode == MICROSITES_ENABLED) {
        $site = (isset($RESPONSE->microsite_detected)) ? $RESPONSE->microsite_detected : APP_NAME;
        $this->must_match("site:$site");
      }
    } // lucene_search
    // .....................................................................
    /**
     * Add a new search term to match. Search terms can be a single word or
     * compound patterns. Each time one of these is added, it has an operator
     * associated with it - whether this term is a "may have" (OR), or a
     * "must have" (AND) term.
     * NB: This method overrides the 'search' class method in order to ensure
     * that all boolean logic terms are in upper case as Lucene requires.
     * The words 'and', 'or' and 'not' are upper-cased in any mix of case
     * (eg. 'And'), but only where they have a space on either side. The
     * operator $op is upper-cased too.
     * @param string $term Search term text to match.
     * @param string $op Joining operator: 'AND', 'OR', 'NOT', 'AND NOT'.
     * @param string $id An optional ID to associate with this search term.
     * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
     */
    function match($term, $op="OR", $id="", $boost="") {
      // Case-insensitive, so that 'And', 'oR' etc. are caught as well..
      $LCops = array("/ and /i","/ or /i","/ not /i");
      $UCops = array(" AND "," OR "," NOT ");
      $term = preg_replace($LCops, $UCops, $term);
      if ($boost != "") $term .= "^$boost";
      search::match($term, strtoupper($op), $id);
    } // match
    // .....................................................................
    /**
     * Add search term to match a field value.
     * This is used to add a search term which defines the value that a given
     * field may or may not contain for the search to succeed. An array of
     * values becomes a single term, with the values OR'ed together. Values
     * are trimmed, and empty ones are skipped; if nothing remains then no
     * term is registered.
     * For adding terms which are 'free' (as a user might type into a search
     * box for example) then you can use the match() method.
     * @param string $fieldname Name of field to reference in the index
     * @param mixed $fieldvalue Value or array of values, for field to match
     * @param string $op Operator to join this term to others in the query
     * @param string $id Optional identity tag for this term
     * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
     */
    function matchfield($fieldname, $fieldvalue, $op="OR", $id="", $boost="") {
      debug_trace($this);
      // No early return here: every debug_trace($this) has to be paired
      // with the closing debug_trace() at the bottom..
      if (isset($fieldvalue)) {
        $values = is_array($fieldvalue) ? $fieldvalue : array($fieldvalue);
        $terms = array();
        foreach ($values as $value) {
          $value = trim($value);
          if ($value != "") {
            $terms[] = $this->fieldterm($fieldname, $value);
          }
        }
        if (count($terms) > 0) {
          // Register the OR'ed values as one search term..
          $this->match(implode(" OR ", $terms), strtoupper($op), $id, $boost);
        }
      }
      debug_trace();
    } // matchfield
    // .....................................................................
    /**
     * Helper function to build field search term. Returns the value
     * prefixed with 'fieldname:', or just the value on its own when the
     * field is the default field.
     * @param string $fieldname Name of field to reference in the index
     * @param string $fieldvalue Value of field to match
     * @return string The search term
     * @access private
     */
    function fieldterm($fieldname, $fieldvalue) {
      if ($fieldname != DEFAULT_FIELD) {
        $term = "$fieldname:$fieldvalue";
      }
      else {
        $term = $fieldvalue;
      }
      return $term;
    } // fieldterm
    // .....................................................................
    /**
     * Add search term to match a field value range.
     * This is used to add a search term which defines the range of values that
     * a given field may or may not contain for the search to succeed.
     * NB: This method is always a must match (implied AND) search term. In
     * other words the search is always restricted/refined by it.
     * The range is passed straight on to set_range(). Note that the field
     * name has no default here and must always be supplied.
     * @param string $fromvalue Lower range value of field to match
     * @param string $tovalue Upper range value of field to match
     * @param string $fieldname Name of field to apply the range to
     */
    function matchrange($fromvalue, $tovalue, $fieldname) {
      debug_trace($this);
      $this->set_range($fromvalue, $tovalue, $fieldname);
      debug_trace();
    } // matchrange
    // .....................................................................
    /**
     * Add search term: must match a field value.
     * This is used to add a search term which defines the value that a given
     * field must contain for the search to succeed.
     * @param string $fieldname Name of field
     * @param string $fieldvalue Value of field to match
     * @param string $id Optional identity tag for this term
     * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
     */
    function must_matchfield($fieldname, $fieldvalue, $id="", $boost="") {
      $this->matchfield($fieldname, $fieldvalue, "AND", $id, $boost);
    } // must_matchfield
    // .....................................................................
    /**
     * Add search term: may match a field value.
     * This is used to add a search term which defines the value that a given
     * field may contain for the search to succeed.
     * @param string $fieldname Name of field
     * @param string $fieldvalue Value of field to match
     * @param string $id Optional identity tag for this term
     * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
     */
    function may_matchfield($fieldname, $fieldvalue, $id="", $boost="") {
      $this->matchfield($fieldname, $fieldvalue, "OR", $id, $boost);
    } // may_matchfield
    // .....................................................................
    /**
     * Add search term: must not match a field value.
     * This is used to add a search term which defines the value that a given
     * field must not contain for the search to succeed.
     * @param string $fieldname Name of field
     * @param string $fieldvalue Value of field to match
     * @param string $id Optional identity tag for this term
     * @param numeric $boost Boost factor. Can be a fraction eg. 0.2, or integer 1,2,3..
     */
    function does_not_matchfield($fieldname, $fieldvalue, $id="", $boost="") {
      $this->matchfield($fieldname, $fieldvalue, "NOT", $id, $boost);
    } // does_not_matchfield
    // .....................................................................
    /**
     * Execute the search
     * If queryvalid() passes, the query string is set on the message, then
     * the limit (and, when a limit is set, the offset) and any date range,
     * and the message is sent to Lucene. The 'executed' flag is set
     * afterwards. An invalid query is only reported via the debugger, and
     * nothing is sent.
     * @param integer $timeoutsecs Override for timeout in seconds
     */
    function execute($timeoutsecs="") {
      debug_trace($this);

      // The queryvalid() method is in the class 'search', and calls
      // the build() method in the same class. The build() method is
      // a raw routine to join together the search terms with ANDs and
      // ORs. You may have to override it for Lucene. If so, just create
      // a new build() method in this class.

      if ($this->queryvalid()) {

        // Define the query string..
        $this->set_query($this->query);

        // Set limit, offset. NB: the offset is only sent
        // along with a limit..
        if ($this->max_results > 0) {
          $this->set_limit($this->max_results);
          if ($this->skip_results > 0) {
            $this->set_first($this->skip_results);
          }
        }

        // Set any daterange..
        if ($this->has_daterange()) {
          $this->set_range($this->date_start, $this->date_end, $this->date_fieldname);
        }

        // Send to Lucene..
        $this->send($timeoutsecs);

        // Flag that we did it..
        $this->executed = true;
        debugbr("lucene search: exec ok: returning " . $this->hitcount() . " hits");
      }
      else {
        debugbr("lucene search: invalid query: '$this->query'");
      }
      debug_trace();
    } // execute

  } // lucene_search class
  1490. // ----------------------------------------------------------------------
  1491.  
  /**
   * Optimize the Lucene index. Typically called once a batch of items
   * has been indexed. An "OPTIMIZE" utility message is built and sent
   * with SOCK_FOREVER passed as the timeout.
   * @param string $application Application name/domain name for searching in
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   * @return boolean True if the operation was successful.
   */
  function lucene_optimize($application="?", $host="", $port="") {
    $msg = new lucene_utilitymsg("OPTIMIZE", $application, $host, $port);
    $msg->send(SOCK_FOREVER);

    // Success is whatever the response reports as its validity
    $result = $msg->response->valid;
    return $result;
  } // lucene_optimize
  1505. // ----------------------------------------------------------------------
  1506.  
  /**
   * Make a backup of the Lucene index. Typically called after a batch
   * of items has been successfully optimized, since that indicates a
   * sound index. The backup directory is chosen as follows:
   *   - the 'Lucene-Backup-Directory=' property in the application
   *     .properties file, if present; otherwise
   *   - the same property in the Lucene 'Server.properties' file;
   *     otherwise
   *   - the server attempts to use a sub-directory called
   *     {Lucene-Index-Directory}_backup, where {Lucene-Index-Directory}
   *     is the index path defined in 'Server.properties'.
   * A "BACKUP" utility message is built and sent with SOCK_FOREVER
   * passed as the timeout.
   * @param string $application Application name/domain name for searching in
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   * @return boolean True if the operation was successful.
   */
  function lucene_backup($application="?", $host="", $port="") {
    $msg = new lucene_utilitymsg("BACKUP", $application, $host, $port);
    $msg->send(SOCK_FOREVER);

    // Success is whatever the response reports as its validity
    $result = $msg->response->valid;
    return $result;
  } // lucene_backup
  1527. // ----------------------------------------------------------------------
  1528.  
  /**
   * Purge the Lucene index of every document it holds.
   * WARNING: this DELETES ALL DOCUMENTS FROM THE INDEX, permanently.
   * There is no confirmation step here, so be very sure this is what
   * you mean to do before calling it. A purge message is built and
   * sent with SOCK_FOREVER passed as the timeout.
   * @param string $application Application name/domain name for searching in
   * @param string $host Hostname or IP of Lucene server
   * @param string $port Port of Lucene server
   * @return boolean True if the purging operation was successful.
   */
  function lucene_purge($application="?", $host="", $port="") {
    $msg = new lucene_purgemsg($application, $host, $port);
    $msg->send(SOCK_FOREVER);

    // Success is whatever the response reports as its validity
    $result = $msg->response->valid;
    return $result;
  } // lucene_purge
  1544. // ----------------------------------------------------------------------
  1545.  
  1546. ?>

Documentation generated by phpDocumentor 1.3.0RC3