Found at: http://publish.ez.no/article/articleprint/83/ |
Building a fulltext search engine with PHP |
In this article I will show you how you can use the database library from eZ publish to build database independent fulltext search engine. I will show you a relatively simple algorithm to get search result ranking and stop words. You will get the full source code for a complete working program to get you started.
DROP TABLE object; CREATE TABLE object ( id int primary key auto_increment, data text ); DROP TABLE object_word_link; CREATE TABLE object_word_link ( id int primary key auto_increment, object_id int not null, word_id int not null, frequency float not null ); drop table word; CREATE TABLE word ( id int primary key auto_increment, word varchar( 150 ), object_count int not null ); CREATE INDEX object_word_link_object_id ON object_word_link (object_id); CREATE INDEX object_word_link_wird_id ON object_word_link (word_id); CREATE INDEX object_word_link_frequency ON object_word_link (frequency); CREATE INDEX word_object_count ON word (object_count); |
// Strip multiple whitespaces
$IndexText = str_replace(".", " ", $IndexText );
$IndexText = str_replace(",", " ", $IndexText );
$IndexText = str_replace("'", " ", $IndexText );
$IndexText = str_replace("\"", " ", $IndexText );
$IndexText = str_replace("\n", " ", $IndexText );
$IndexText = str_replace("\r", " ", $IndexText );
$IndexText = preg_replace("(\s+)", " ", $IndexText );
// Split text on whitespace
$indexArray =& split( " ", $IndexText );
// Count the total words in index text
$totalWordCount = count( $indexArray );
// Count the number of instances of each word
$wordCountArray = array_count_values( $indexArray );
// Strip double words
$indexArray = array_unique( $indexArray );
// Count unique words
$uniqueWordCount = count( $indexArray ); |
foreach ( $indexArray as $indexWord )
{
// Store word if it does not exist.
$wordRes = array();
$db->array_query( $wordRes, "SELECT * FROM word WHERE word='$indexWord'" );
if ( count( $wordRes ) == 1 )
{
$wordID = $wordRes[0]["id"];
$db->query( "UPDATE word SET object_count=( object_count + 1 ) WHERE id='$wordID'" );
}
else
{
$wordID = $db->nextID( "word", "id" );
$db->query( "INSERT INTO word ( id, word, object_count ) VALUES ( '$wordID', '$indexWord', '1' )" );
}
print( "Indexing word: '$indexWord'" );
// Calculate the relevans ranking for this word
$frequency = ( $wordCountArray[$indexWord] / $totalWordCount );
print( "Internal normalized word frequency: $frequency" );
$linkID = $db->nextID( "object_word_link", "id" );
$db->query( "INSERT INTO
object_word_link ( id, word_id, object_id, frequency )
VALUES ( '$nextID', '$wordID', '$objectID', '$frequency' )" );
} |
// Strip multiple whitespace
$SearchText = preg_replace("(\s+)", " ", $db->escapeString( $SearchText ) );
// Split text on whitespace
$searchArray =& split( " ", $SearchText ); |
// Get the total number of objects
$objectCount = array();
$db->array_query( $objectCount, "SELECT COUNT(*) AS count FROM object" );
$totalObjectCount = $objectCount[0]["count"];
// Search words can at most be present in 70% of the objects
$stopWordFrequency = 0.7;
$wordSQL = "";
$i = 0;
// Build the word query string
foreach ( $searchArray as $searchWord )
{
if ( $i == 0 )
$wordSQL .= "word.word='" .strToLower( $searchWord ) ."' ";
else
$wordSQL .= " OR word.word='" .strToLower( $searchWord ) ."' ";
$i++;
}
// Build the full search query using logical OR if multiple words are searched on
$searchQuery = "SELECT object.id, object_word_link.frequency
FROM object, object_word_link, word
WHERE object.id=object_word_link.object_id
AND word.id=object_word_link.word_id
AND ( $wordSQL )
AND ( ( word.object_count / $totalObjectCount ) < $stopWordFrequency )
ORDER BY object_word_link.frequency DESC";
$objectRes = array();
// Execute the query
$db->array_query( $objectRes, $searchQuery ); |
Attached files: