Trying out the new context grabbing function in search.

author brettp <brettp@36083f99-b078-4883-b0ff-0f9b5a30f544>

Tue, 10 Nov 2009 03:48:41 +0000 (03:48 +0000)

committer brettp <brettp@36083f99-b078-4883-b0ff-0f9b5a30f544>

Tue, 10 Nov 2009 03:48:41 +0000 (03:48 +0000)
author brettp <brettp@36083f99-b078-4883-b0ff-0f9b5a30f544>
Tue, 10 Nov 2009 03:48:41 +0000 (03:48 +0000)
committer brettp <brettp@36083f99-b078-4883-b0ff-0f9b5a30f544>
Tue, 10 Nov 2009 03:48:41 +0000 (03:48 +0000)
diff --git a/mod/search/search_hooks.php b/mod/search/search_hooks.php

index 941e01e8c70f9912ccbf6a5ecb76506fac87123a..f7a49400f36f9e6997944ae85f2f1513f3af0820 100644 (file)
--- a/mod/search/search_hooks.php
+++ b/mod/search/search_hooks.php
@@ -43,10 +43,12 @@ function search_objects_hook($hook, $type, $value, $params) {
         // add the volatile data for why these entities have been returned.
         foreach ($entities as $entity) {
                 //$title = search_get_highlighted_relevant_substrings($entity->title, $params['query']);
-               $title = search_get_relevant_substring($entity->title, $params['query'], '<strong class="searchMatch">', '</strong>');
+               //$title = search_get_relevant_substring($entity->title, $params['query'], '<strong class="searchMatch">', '</strong>');
+               $title = search_get_highlighted_relevant_substrings($entity->title, $params['query']);
                 $entity->setVolatileData('search_matched_title', $title);
  
-               $desc = search_get_relevant_substring($entity->description, $params['query'], '<strong class="searchMatch">', '</strong>');
+               //$desc = search_get_relevant_substring($entity->description, $params['query'], '<strong class="searchMatch">', '</strong>');
+               $desc = search_get_highlighted_relevant_substrings($entity->description, $params['query']);
                 $entity->setVolatileData('search_matched_description', $desc);
         }
  
@@ -91,10 +93,10 @@ function search_groups_hook($hook, $type, $value, $params) {
  
         // add the volatile data for why these entities have been returned.
         foreach ($entities as $entity) {
-               $description = search_get_relevant_substring($entity->description, $query, '<strong class="searchMatch">', '</strong>');
+               $description = search_get_highlighted_relevant_substrings($entity->description, $query);
                 $entity->setVolatileData('search_matched_title', $description);
  
-               $name = search_get_relevant_substring($entity->name, $query, '<strong class="searchMatch">', '</strong>');
+               $name = search_get_highlighted_relevant_substrings($entity->name, $query);
                 $entity->setVolatileData('search_matched_description', $name);
         }
  
@@ -139,10 +141,10 @@ function search_users_hook($hook, $type, $value, $params) {
  
         // add the volatile data for why these entities have been returned.
         foreach ($entities as $entity) {
-               $username = search_get_relevant_substring($entity->username, $query, '<strong class="searchMatch">', '</strong>');
+               $username = search_get_highlighted_relevant_substrings($entity->username, $query);
                 $entity->setVolatileData('search_matched_title', $username);
  
-               $name = search_get_relevant_substring($entity->name, $query, '<strong class="searchMatch">', '</strong>');
+               $name = search_get_highlighted_relevant_substrings($entity->name, $query);
                 $entity->setVolatileData('search_matched_description', $name);
         }
  
@@ -180,7 +182,7 @@ function search_tags_hook($hook, $type, $value, $params) {
         // add the volatile data for why these entities have been returned.
         foreach ($entities as $entity) {
                 $tags = implode(',', $entity->tags);
-               $tags_str = search_get_relevant_substring($tags, $query, '<strong class="searchMatch">', '</strong>');
+               $tags_str = search_get_highlighted_relevant_substrings($tags, $params['query']);
                 $entity->setVolatileData('search_matched_tags', $tags_str);
         }
  
@@ -253,7 +255,7 @@ function search_comments_hook($hook, $type, $value, $params) {
                 if (!$entity = get_entity($comment->entity_guid)) {
                         continue;
                 }
-               $comment_str = search_get_relevant_substring($comment->comment, $query, '<strong class="searchMatch">', '</strong>');
+               $comment_str = search_get_highlighted_relevant_substrings($comment->comment, $query);
                 $entity->setVolatileData('search_matched_comment', $comment_str);
                 $entity->setVolatileData('search_matched_comment_owner_guid', $comment->owner_guid);
                 $entity->setVolatileData('search_matched_comment_time_created', $comment->time_created);
diff --git a/mod/search/start.php b/mod/search/start.php

index aa76c13b2b581bff21ee54c8d92ce8eeea891f4a..a53cebbb63cae6d052c873955314af375de9952d 100644 (file)
--- a/mod/search/start.php
+++ b/mod/search/start.php
@@ -87,8 +87,16 @@ function search_page_handler($page) {
   * @return unknown_type
   */
  function search_get_highlighted_relevant_substrings($haystack, $needle, $min_match_context = 15, $max_length = 250) {
+       global $CONFIG;
         $haystack = strip_tags($haystack);
         $haystack_lc = strtolower($haystack);
+//
+//     $haystack = "Like merge sort, quicksort can also be easily parallelized due to its "
+//             . "divide-and-conquer nature. Individual in-place partition operations are difficult "
+//             . "to parallelize, but once divided, different sections of the list can be sorted in parallel.  "
+//             . "If we have p processors, we can divide a list of n ele";
+//
+//     $needle = 'difficult to sort in parallel';
  
         // for now don't worry about "s or boolean operators
         $needle = str_replace(array('"', '-', '+', '~'), '', stripslashes(strip_tags($needle)));
@@ -105,8 +113,30 @@ function search_get_highlighted_relevant_substrings($haystack, $needle, $min_mat
                 }
         }
  
+       /*
+
+       $body_len = 250
+
+       $context = 5-30, 20-45, 75-100, 150
+
+       can pull out context either on:
+               one of each matching term
+               X # of highest matching terms
+
+
+       */
         $substr_counts = array();
         $str_pos = array();
+       // matrices for begin and end context lengths.
+       // defaults to min context.  will add additional context later if needed
+       $starts = array();
+       $stops = array();
+
+       // map the words to the starts and stops
+       $words_arg = array();
+       $context_count = 0;
+
+
         // get the full count of matches.
         foreach ($words as $word) {
                 $word = strtolower($word);
@@ -114,29 +144,163 @@ function search_get_highlighted_relevant_substrings($haystack, $needle, $min_mat
                 $word_len = strlen($word);
  
                 // find the start positions for the words
-               // get the context for words based upon
                 if ($count > 1) {
                         $str_pos[$word] = array();
                         $offset = 0;
                         while (FALSE !== $pos = strpos($haystack, $word, $offset)) {
                                 $str_pos[$word][] = $pos;
+                               $starts[] = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
+                               $stops[] = $pos + $word_len + $min_match_context;
+                               $words_arg[] = $word;
+                               $context_count += $min_match_context + $word_len;
                                 $offset += $pos + $word_len;
                         }
                 } else {
-                       $str_pos[$word] = array(strpos($haystack, $word));
+                       $pos = strpos($haystack, $word);
+                       $str_pos[$word] = array($pos);
+                       $starts[] = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
+                       $stops[] = $pos + $word_len + $min_match_context;
+                       $context_count += $min_match_context + $word_len;
+                       $words_arg[] = $word;
                 }
                 $substr_counts[$word] = $count;
         }
  
-//A test with multiple words and now more in the subject too because words need to be everywhere
-
         // sort by order of occurence
-       krsort($substr_counts);
+       //krsort($substr_counts);
         $full_count = array_sum($substr_counts);
  
+       // figure out what the context needs to be.
+       // take one of each matched phrase
+       // if there are any
+
+//
+//     var_dump($str_pos);
+//     var_dump($substr_counts);
+//     var_dump($context_count);
+
+
+       // sort to put them in order of occurrence
+       asort($starts, SORT_NUMERIC);
+       asort($stops, SORT_NUMERIC);
+
+       // offset them correctly
+       $starts[] = 0;
+       $new_stops = array(0);
+       foreach ($stops as $i => $pos) {
+               $new_stops[$i+1] = $pos;
+       }
+       $stops = $new_stops;
+
+       $substrings = array();
+       $len = count($starts);
+
+       $starts = array_merge($starts);
+       $stops = array_merge($stops);
+
+       $offsets = array();
+       $limits = array();
+       $c = 0;
+       foreach ($starts as $i => $start) {
+               $stop = $stops[$i];
+               $offsets[$c] = $start;
+               $limits[$c] = $stop;
+
+               // never need the last one as it's just a displacing entry
+               if ($c+1 == count($starts)) {
+                       break;
+               }
+
+               if ($start - $stop < 0) {
+                       //var_dump("Looking at c=$c & $start - $stop and going to unset {$limits[$c]}");
+                       unset($offsets[$c]);
+                       unset($limits[$c]);
+               }
+               $c++;
+       }
+
+       // reset indexes and remove placeholder elements.
+       $limits = array_merge($limits);
+       array_shift($limits);
+       $offsets = array_merge($offsets);
+       array_pop($offsets);
+
+       // figure out if we need to adjust the offsets from the base
+       // this could result in overlapping summaries.
+       // might be nicer to just remove it.
+
+       $total_len = 0;
+       foreach ($offsets as $i => $offset) {
+               $total_len += $limits[$i] - $offset;
+       }
+
+       $add_length = 0;
+       if ($total_length < $max_length) {
+               $add_length = floor((($max_length - $total_len) / count($offsets)) / 2);
+       }
+
+
+       foreach ($offsets as $i => $offset) {
+               $limit = $limits[$i];
+               if ($offset == 0 && $add_length) {
+                       $limit += $add_length;
+               } else {
+                       $offset = $offset - $add_length;
+               }
+               $string = substr($haystack, $offset, $limit - $offset);
+
+               if ($limit-$offset < strlen($haystack)) {
+                       $string = "$string...";
+               }
+
+               $substrings[] = $string;
+       }
  
+       $matched = '';
+       foreach ($substrings as $string) {
+               if (strlen($matched) + strlen($string) < $max_length) {
+                       $matched .= $string;
+               }
+       }
+
+       foreach ($words as $word) {
+               $search = "/($word)/i";
+               $replace = "<strong class=\"searchMatch\">$1</strong>";
+               $matched = preg_replace($search, $replace, $matched);
+       }
+
+       return $matched;
  
  
+       // crap below..
+
+
+
+       for ($i=0; $i<$len; $i++) {
+               $start = $starts[$i];
+               $stop = $stops[$i];
+               var_dump("Looking at $i = $start - $stop");
+
+               while ($start - $stop <= 0) {
+                       $stop = $stops[$i++];
+                       var_dump("New start is $stop");
+               }
+
+               var_dump("$start-$stop");
+       }
+
+       // find the intersecting contexts
+       foreach ($starts as $i => $start_pos) {
+               $words .= "{$words_arg[$i]}\t\t\t";
+               echo "$start_pos\t\t\t";
+       }
+
+       echo "\n";
+
+       foreach ($stops as $i => $stop_pos) {
+               echo "$stop_pos\t\t\t";
+       }
+echo "\n$words\n";
  
         // get full number of matches against all words to see how many we actually want to look at.
  
@@ -170,8 +334,6 @@ function search_get_highlighted_relevant_substrings($haystack, $needle, $min_mat
                                                 $word .= " {$words_orig[$word_i]}";
                                                 unset($words_orig[$word_i]);
                                         }
-
-
                                 }
  
                                 break;
@@ -255,8 +417,12 @@ function search_get_relevant_substring($haystack, $needle, $before = '', $after
         }
  
         // surround if needed
+       // @todo would getting each position of the match then
+       // inserting manually based on the position be faster than preg_replace()?
         if ($before || $after) {
                 $matched = str_ireplace($needle, $before . $needle . $after, $matched);
+               //$matched = mb_ereg_replace("")
+               // insert before
         }
  
         return $matched;
author	brettp <brettp@36083f99-b078-4883-b0ff-0f9b5a30f544>
	Tue, 10 Nov 2009 03:48:41 +0000 (03:48 +0000)
committer	brettp <brettp@36083f99-b078-4883-b0ff-0f9b5a30f544>
	Tue, 10 Nov 2009 03:48:41 +0000 (03:48 +0000)
mod/search/search_hooks.php		patch \| blob \| history
mod/search/start.php		patch \| blob \| history