friendica-github/library/spam/b8/b8.php.ORIG

<?php

#   Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
#
#   b8 - A Bayesian spam filter written in PHP 5
#
#   This program is free software; you can redistribute it and/or modify it
#   under the terms of the GNU Lesser General Public License as published by
#   the Free Software Foundation in version 2.1 of the License.
#
#   This program is distributed in the hope that it will be useful, but
#   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
#   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
#   License for more details.
#
#   You should have received a copy of the GNU Lesser General Public License
#   along with this program; if not, write to the Free Software Foundation,
#   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

/**
 * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
 *
 * @license LGPL
 * @access public
 * @package b8
 * @author Tobias Leupold
 * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
 */

class b8
{

	public $config = array(
		'min_size'      => 3,
		'max_size'      => 30,
		'allow_numbers' => FALSE,
		'lexer'         => 'default',
		'degenerator'   => 'default',
		'storage'       => 'dba',
		'use_relevant'  => 15,
		'min_dev'       => 0.2,
		'rob_s'         => 0.3,
		'rob_x'         => 0.5
	);

	private $_lexer      = NULL;
	private $_database   = NULL;
	private $_token_data = NULL;

	const SPAM    = 'spam';
	const HAM     = 'ham';
	const LEARN   = 'learn';
	const UNLEARN = 'unlearn';

	const STARTUP_FAIL_DATABASE = 'STARTUP_FAIL_DATABASE';
	const STARTUP_FAIL_LEXER    = 'STARTUP_FAIL_LEXER';
	const TRAINER_CATEGORY_FAIL = 'TRAINER_CATEGORY_FAIL';

	/**
	 * Constructs b8
	 *
	 * @access public
	 * @return void
	 */

	function __construct($config = array(), $database_config)
	{

		# Validate config data

		if(count($config) > 0) {

			foreach ($config as $name=>$value) {

				switch($name) {

					case 'min_dev':
					case 'rob_s':
					case 'rob_x':
						$this->config[$name] = (float) $value;
						break;

					case 'min_size':
					case 'max_size':
					case 'use_relevant':
						$this->config[$name] = (int) $value;
						break;

					case 'allow_numbers':
						$this->config[$name] = (bool) $value;
						break;

					case 'lexer':
						$value = (string) strtolower($value);
						$this->config[$name] = is_file(dirname(__FILE__) . DIRECTORY_SEPARATOR . 'lexer' . DIRECTORY_SEPARATOR . "lexer_" . $value . '.php') === TRUE ? $value : 'default';
						break;

					case 'storage':
						$this->config[$name] = (string) $value;
						break;

				}

			}

		}

		# Setup the database backend

		# Get the basic storage class used by all backends
		if($this->load_class('b8_storage_base', dirname(__FILE__) . DIRECTORY_SEPARATOR . 'storage' . DIRECTORY_SEPARATOR . 'storage_base.php') === FALSE)
			return;

		# Get the degenerator we need
		if($this->load_class('b8_degenerator_' . $this->config['degenerator'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'degenerator' . DIRECTORY_SEPARATOR . 'degenerator_' . $this->config['degenerator'] . '.php') === FALSE)
			return;

		# Get the actual storage backend we need
		if($this->load_class('b8_storage_' . $this->config['storage'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'storage' . DIRECTORY_SEPARATOR . 'storage_' . $this->config['storage'] . '.php') === FALSE)
			return;

		# Setup the backend
		$class = 'b8_storage_' . $this->config['storage'];
		$this->_database = new $class(
			$database_config,
			$this->config['degenerator'], date('ymd')
		);

		# Setup the lexer class

		if($this->load_class('b8_lexer_' . $this->config['lexer'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'lexer' . DIRECTORY_SEPARATOR . 'lexer_' . $this->config['lexer'] . '.php') === FALSE)
			return;

		$class = 'b8_lexer_' . $this->config['lexer'];
		$this->_lexer = new $class(
			array(
				'min_size' => $this->config['min_size'],
				'max_size' => $this->config['max_size'],
				'allow_numbers' => $this->config['allow_numbers']
			)
		);

	}

	/**
	 * Load a class file if a class has not been defined yet.
	 *
	 * @access public
	 * @return boolean Returns TRUE if everything is okay, otherwise FALSE.
	 */

	public function load_class($class_name, $class_file)
	{

		if(class_exists($class_name, FALSE) === FALSE) {

			$included = require_once $class_file;

			if($included === FALSE or class_exists($class_name, FALSE) === FALSE)
				return FALSE;

		}

		return TRUE;

	}

	/**
	 * Validates the class has all it needs to work.
	 *
	 * @access public
	 * @return mixed Returns TRUE if everything is okay, otherwise an error code.
	 */

	public function validate()
	{

		if($this->_database === NULL)
			return self::STARTUP_FAIL_DATABASE;

		# Connect the database backend if we aren't connected yet

		elseif($this->_database->connected === FALSE) {

			$connection = $this->_database->connect();

			if($connection !== TRUE)
				return $connection;

		}

		if($this->_lexer === NULL)
			return self::STARTUP_FAIL_LEXER;

		return TRUE;

	}

	/**
	 * Classifies a text
	 *
	 * @access public
	 * @package default
	 * @param string $text
	 * @return float The rating between 0 (ham) and 1 (spam)
	 */

	public function classify($text)
	{

		# Validate the startup

		$started_up = $this->validate();

		if($started_up !== TRUE)
			return $started_up;

		# Get the internal database variables, containing the number of ham and
		# spam texts so the spam probability can be calculated in relation to them
		$internals = $this->_database->get_internals();

		# Calculate the spamminess of all tokens

		# Get all tokens we want to rate

		$tokens = $this->_lexer->get_tokens($text);

		# Check if the lexer failed
		# (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
		if(!is_array($tokens))
			return $tokens;

		# Fetch all availible data for the token set from the database
		$this->_token_data = $this->_database->get(array_keys($tokens));

		# Calculate the spamminess and importance for each token (or a degenerated form of it)

		$word_count = array();
		$rating     = array();
		$importance = array();

		foreach($tokens as $word => $count) {

			$word_count[$word] = $count;

			# Although we only call this function only here ... let's do the
			# calculation stuff in a function to make this a bit less confusing ;-)
			$rating[$word] = $this->_get_probability($word, $internals['texts_ham'], $internals['texts_spam']);

			$importance[$word] = abs(0.5 - $rating[$word]);

		}

		# Order by importance
		arsort($importance);
		reset($importance);

		# Get the most interesting tokens (use all if we have less than the given number)

		$relevant = array();

		for($i = 0; $i < $this->config['use_relevant']; $i++) {

			if($tmp = each($importance)) {

				# Important tokens remain

				# If the token's rating is relevant enough, use it

				if(abs(0.5 - $rating[$tmp['key']]) > $this->config['min_dev']) {

					# Tokens that appear more than once also count more than once

					for($x = 0, $l = $word_count[$tmp['key']]; $x < $l; $x++)
						array_push($relevant, $rating[$tmp['key']]);

				}

			}

			else {
				# We have less than words to use, so we already
				# use what we have and can break here
				break;
			}

		}

		# Calculate the spamminess of the text (thanks to Mr. Robinson ;-)
		# We set both hamminess and Spamminess to 1 for the first multiplying
		$hamminess  = 1;
		$spamminess = 1;

		# Consider all relevant ratings
		foreach($relevant as $value) {
			$hamminess  *= (1.0 - $value);
			$spamminess *= $value;
		}

		# If no token was good for calculation, we really don't know how
		# to rate this text; so we assume a spam and ham probability of 0.5

		if($hamminess === 1 and $spamminess === 1) {
			$hamminess = 0.5;
			$spamminess = 0.5;
			$n = 1;
		}
		else {
			# Get the number of relevant ratings
			$n = count($relevant);
		}

		# Calculate the combined rating

		# The actual hamminess and spamminess
		$hamminess  = 1 - pow($hamminess,  (1 / $n));
		$spamminess = 1 - pow($spamminess, (1 / $n));

		# Calculate the combined indicator
		$probability = ($hamminess - $spamminess) / ($hamminess + $spamminess);

		# We want a value between 0 and 1, not between -1 and +1, so ...
		$probability = (1 + $probability) / 2;

		# Alea iacta est
		return $probability;

	}

	/**
	 * Calculate the spamminess of a single token also considering "degenerated" versions
	 *
	 * @access private
	 * @param string $word
	 * @param string $texts_ham
	 * @param string $texts_spam
	 * @return void
	 */

	private function _get_probability($word, $texts_ham, $texts_spam)
	{

		# Let's see what we have!

		if(isset($this->_token_data['tokens'][$word]) === TRUE) {
			# The token was in the database, so we can use it's data as-is
			# and calculate the spamminess of this token directly
			return $this->_calc_probability($this->_token_data['tokens'][$word], $texts_ham, $texts_spam);
		}

		# Damn. The token was not found, so do we have at least similar words?

		if(isset($this->_token_data['degenerates'][$word]) === TRUE) {

			# We found similar words, so calculate the spamminess for each one
			# and choose the most important one for the further calculation

			# The default rating is 0.5 simply saying nothing
			$rating = 0.5;

			foreach($this->_token_data['degenerates'][$word] as $degenerate => $count) {

				# Calculate the rating of the current degenerated token
				$rating_tmp = $this->_calc_probability($count, $texts_ham, $texts_spam);

				# Is it more important than the rating of another degenerated version?
				if(abs(0.5 - $rating_tmp) > abs(0.5 - $rating))
					$rating = $rating_tmp;

			}

			return $rating;

		}

		else {
			# The token is really unknown, so choose the default rating
			# for completely unknown tokens. This strips down to the
			# robX parameter so we can cheap out the freaky math ;-)
			return $this->config['rob_x'];
		}

	}

	/**
	 * Do the actual spamminess calculation of a single token
	 *
	 * @access private
	 * @param array $data
	 * @param string $texts_ham
	 * @param string $texts_spam
	 * @return void
	 */

	private function _calc_probability($data, $texts_ham, $texts_spam)
	{

		# Calculate the basic probability by Mr. Graham

		# But: consider the number of ham and spam texts saved instead of the
		# number of entries where the token appeared to calculate a relative
		# spamminess because we count tokens appearing multiple times not just
		# once but as often as they appear in the learned texts

		$rel_ham = $data['count_ham'];
		$rel_spam = $data['count_spam'];

		if($texts_ham > 0)
			$rel_ham = $data['count_ham'] / $texts_ham;

		if($texts_spam > 0)
			$rel_spam = $data['count_spam'] / $texts_spam;

		$rating = $rel_spam / ($rel_ham + $rel_spam);

		# Calculate the better probability proposed by Mr. Robinson
		$all = $data['count_ham'] + $data['count_spam'];
		return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating)) / ($this->config['rob_s'] + $all);

	}

	/**
	 * Check the validity of the category of a request
	 *
	 * @access private
	 * @param string $category
	 * @return void
	 */

	private function _check_category($category)
	{
		return $category === self::HAM or $category === self::SPAM;
	}

	/**
	 * Learn a reference text
	 *
	 * @access public
	 * @param string $text
	 * @param const $category Either b8::SPAM or b8::HAM
	 * @return void
	 */

	public function learn($text, $category)
	{
		return $this->_process_text($text, $category, self::LEARN);
	}

	/**
	 * Unlearn a reference text
	 *
	 * @access public
	 * @param string $text
	 * @param const $category Either b8::SPAM or b8::HAM
	 * @return void
	 */

	public function unlearn($text, $category)
	{
		return $this->_process_text($text, $category, self::UNLEARN);
	}

	/**
	 * Does the actual interaction with the storage backend for learning or unlearning texts
	 *
	 * @access private
	 * @param string $text
	 * @param const $category Either b8::SPAM or b8::HAM
	 * @param const $action Either b8::LEARN or b8::UNLEARN
	 * @return void
	 */

	private function _process_text($text, $category, $action)
	{

		# Validate the startup

		$started_up = $this->validate();

		if($started_up !== TRUE)
			return $started_up;

		# Look if the request is okay
		if($this->_check_category($category) === FALSE)
			return self::TRAINER_CATEGORY_FAIL;

		# Get all tokens from $text

		$tokens = $this->_lexer->get_tokens($text);

		# Check if the lexer failed
		# (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
		if(!is_array($tokens))
			return $tokens;

		# Pass the tokens and what to do with it to the storage backend
		return $this->_database->process_text($tokens, $category, $action);

	}

}

?>
add uid variable to b8 classes 2012-02-02 03:23:05 +00:00			`<?php`

			`# Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>`
			`#`
			`# b8 - A Bayesian spam filter written in PHP 5`
			`#`
			`# This program is free software; you can redistribute it and/or modify it`
			`# under the terms of the GNU Lesser General Public License as published by`
			`# the Free Software Foundation in version 2.1 of the License.`
			`#`
			`# This program is distributed in the hope that it will be useful, but`
			`# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public`
			`# License for more details.`
			`#`
			`# You should have received a copy of the GNU Lesser General Public License`
			`# along with this program; if not, write to the Free Software Foundation,`
			`# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.`

			`/**`
			`* Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>`
			`*`
			`* @license LGPL`
			`* @access public`
			`* @package b8`
			`* @author Tobias Leupold`
			`* @author Oliver Lillie (aka buggedcom) (original PHP 5 port)`
			`*/`

			`class b8`
			`{`

			`public $config = array(`
			`'min_size' => 3,`
			`'max_size' => 30,`
			`'allow_numbers' => FALSE,`
			`'lexer' => 'default',`
			`'degenerator' => 'default',`
			`'storage' => 'dba',`
			`'use_relevant' => 15,`
			`'min_dev' => 0.2,`
			`'rob_s' => 0.3,`
			`'rob_x' => 0.5`
			`);`

			`private $_lexer = NULL;`
			`private $_database = NULL;`
			`private $_token_data = NULL;`

			`const SPAM = 'spam';`
			`const HAM = 'ham';`
			`const LEARN = 'learn';`
			`const UNLEARN = 'unlearn';`

			`const STARTUP_FAIL_DATABASE = 'STARTUP_FAIL_DATABASE';`
			`const STARTUP_FAIL_LEXER = 'STARTUP_FAIL_LEXER';`
			`const TRAINER_CATEGORY_FAIL = 'TRAINER_CATEGORY_FAIL';`

			`/**`
			`* Constructs b8`
			`*`
			`* @access public`
			`* @return void`
			`*/`

			`function __construct($config = array(), $database_config)`
			`{`

			`# Validate config data`

			`if(count($config) > 0) {`

			`foreach ($config as $name=>$value) {`

			`switch($name) {`

			`case 'min_dev':`
			`case 'rob_s':`
			`case 'rob_x':`
			`$this->config[$name] = (float) $value;`
			`break;`

			`case 'min_size':`
			`case 'max_size':`
			`case 'use_relevant':`
			`$this->config[$name] = (int) $value;`
			`break;`

			`case 'allow_numbers':`
			`$this->config[$name] = (bool) $value;`
			`break;`

			`case 'lexer':`
			`$value = (string) strtolower($value);`
			`$this->config[$name] = is_file(dirname(__FILE__) . DIRECTORY_SEPARATOR . 'lexer' . DIRECTORY_SEPARATOR . "lexer_" . $value . '.php') === TRUE ? $value : 'default';`
			`break;`

			`case 'storage':`
			`$this->config[$name] = (string) $value;`
			`break;`

			`}`

			`}`

			`}`

			`# Setup the database backend`

			`# Get the basic storage class used by all backends`
			`if($this->load_class('b8_storage_base', dirname(__FILE__) . DIRECTORY_SEPARATOR . 'storage' . DIRECTORY_SEPARATOR . 'storage_base.php') === FALSE)`
			`return;`

			`# Get the degenerator we need`
			`if($this->load_class('b8_degenerator_' . $this->config['degenerator'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'degenerator' . DIRECTORY_SEPARATOR . 'degenerator_' . $this->config['degenerator'] . '.php') === FALSE)`
			`return;`

			`# Get the actual storage backend we need`
			`if($this->load_class('b8_storage_' . $this->config['storage'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'storage' . DIRECTORY_SEPARATOR . 'storage_' . $this->config['storage'] . '.php') === FALSE)`
			`return;`

			`# Setup the backend`
			`$class = 'b8_storage_' . $this->config['storage'];`
			`$this->_database = new $class(`
			`$database_config,`
			`$this->config['degenerator'], date('ymd')`
			`);`

			`# Setup the lexer class`

			`if($this->load_class('b8_lexer_' . $this->config['lexer'], dirname(__FILE__) . DIRECTORY_SEPARATOR . 'lexer' . DIRECTORY_SEPARATOR . 'lexer_' . $this->config['lexer'] . '.php') === FALSE)`
			`return;`

			`$class = 'b8_lexer_' . $this->config['lexer'];`
			`$this->_lexer = new $class(`
			`array(`
			`'min_size' => $this->config['min_size'],`
			`'max_size' => $this->config['max_size'],`
			`'allow_numbers' => $this->config['allow_numbers']`
			`)`
			`);`

			`}`

			`/**`
			`* Load a class file if a class has not been defined yet.`
			`*`
			`* @access public`
			`* @return boolean Returns TRUE if everything is okay, otherwise FALSE.`
			`*/`

			`public function load_class($class_name, $class_file)`
			`{`

			`if(class_exists($class_name, FALSE) === FALSE) {`

			`$included = require_once $class_file;`

			`if($included === FALSE or class_exists($class_name, FALSE) === FALSE)`
			`return FALSE;`

			`}`

			`return TRUE;`

			`}`

			`/**`
			`* Validates the class has all it needs to work.`
			`*`
			`* @access public`
			`* @return mixed Returns TRUE if everything is okay, otherwise an error code.`
			`*/`

			`public function validate()`
			`{`

			`if($this->_database === NULL)`
			`return self::STARTUP_FAIL_DATABASE;`

			`# Connect the database backend if we aren't connected yet`

			`elseif($this->_database->connected === FALSE) {`

			`$connection = $this->_database->connect();`

			`if($connection !== TRUE)`
			`return $connection;`

			`}`

			`if($this->_lexer === NULL)`
			`return self::STARTUP_FAIL_LEXER;`

			`return TRUE;`

			`}`

			`/**`
			`* Classifies a text`
			`*`
			`* @access public`
			`* @package default`
			`* @param string $text`
			`* @return float The rating between 0 (ham) and 1 (spam)`
			`*/`

			`public function classify($text)`
			`{`

			`# Validate the startup`

			`$started_up = $this->validate();`

			`if($started_up !== TRUE)`
			`return $started_up;`

			`# Get the internal database variables, containing the number of ham and`
			`# spam texts so the spam probability can be calculated in relation to them`
			`$internals = $this->_database->get_internals();`

			`# Calculate the spamminess of all tokens`

			`# Get all tokens we want to rate`

			`$tokens = $this->_lexer->get_tokens($text);`

			`# Check if the lexer failed`
			`# (if so, $tokens will be a lexer error code, if not, $tokens will be an array)`
			`if(!is_array($tokens))`
			`return $tokens;`

			`# Fetch all availible data for the token set from the database`
			`$this->_token_data = $this->_database->get(array_keys($tokens));`

			`# Calculate the spamminess and importance for each token (or a degenerated form of it)`

			`$word_count = array();`
			`$rating = array();`
			`$importance = array();`

			`foreach($tokens as $word => $count) {`

			`$word_count[$word] = $count;`

			`# Although we only call this function only here ... let's do the`
			`# calculation stuff in a function to make this a bit less confusing ;-)`
			`$rating[$word] = $this->_get_probability($word, $internals['texts_ham'], $internals['texts_spam']);`

			`$importance[$word] = abs(0.5 - $rating[$word]);`

			`}`

			`# Order by importance`
			`arsort($importance);`
			`reset($importance);`

			`# Get the most interesting tokens (use all if we have less than the given number)`

			`$relevant = array();`

			`for($i = 0; $i < $this->config['use_relevant']; $i++) {`

			`if($tmp = each($importance)) {`

			`# Important tokens remain`

			`# If the token's rating is relevant enough, use it`

			`if(abs(0.5 - $rating[$tmp['key']]) > $this->config['min_dev']) {`

			`# Tokens that appear more than once also count more than once`

			`for($x = 0, $l = $word_count[$tmp['key']]; $x < $l; $x++)`
			`array_push($relevant, $rating[$tmp['key']]);`

			`}`

			`}`

			`else {`
			`# We have less than words to use, so we already`
			`# use what we have and can break here`
			`break;`
			`}`

			`}`

			`# Calculate the spamminess of the text (thanks to Mr. Robinson ;-)`
			`# We set both hamminess and Spamminess to 1 for the first multiplying`
			`$hamminess = 1;`
			`$spamminess = 1;`

			`# Consider all relevant ratings`
			`foreach($relevant as $value) {`
			`$hamminess *= (1.0 - $value);`
			`$spamminess *= $value;`
			`}`

			`# If no token was good for calculation, we really don't know how`
			`# to rate this text; so we assume a spam and ham probability of 0.5`

			`if($hamminess === 1 and $spamminess === 1) {`
			`$hamminess = 0.5;`
			`$spamminess = 0.5;`
			`$n = 1;`
			`}`
			`else {`
			`# Get the number of relevant ratings`
			`$n = count($relevant);`
			`}`

			`# Calculate the combined rating`

			`# The actual hamminess and spamminess`
			`$hamminess = 1 - pow($hamminess, (1 / $n));`
			`$spamminess = 1 - pow($spamminess, (1 / $n));`

			`# Calculate the combined indicator`
			`$probability = ($hamminess - $spamminess) / ($hamminess + $spamminess);`

			`# We want a value between 0 and 1, not between -1 and +1, so ...`
			`$probability = (1 + $probability) / 2;`

			`# Alea iacta est`
			`return $probability;`

			`}`

			`/**`
			`* Calculate the spamminess of a single token also considering "degenerated" versions`
			`*`
			`* @access private`
			`* @param string $word`
			`* @param string $texts_ham`
			`* @param string $texts_spam`
			`* @return void`
			`*/`

			`private function _get_probability($word, $texts_ham, $texts_spam)`
			`{`

			`# Let's see what we have!`

			`if(isset($this->_token_data['tokens'][$word]) === TRUE) {`
			`# The token was in the database, so we can use it's data as-is`
			`# and calculate the spamminess of this token directly`
			`return $this->_calc_probability($this->_token_data['tokens'][$word], $texts_ham, $texts_spam);`
			`}`

			`# Damn. The token was not found, so do we have at least similar words?`

			`if(isset($this->_token_data['degenerates'][$word]) === TRUE) {`

			`# We found similar words, so calculate the spamminess for each one`
			`# and choose the most important one for the further calculation`

			`# The default rating is 0.5 simply saying nothing`
			`$rating = 0.5;`

			`foreach($this->_token_data['degenerates'][$word] as $degenerate => $count) {`

			`# Calculate the rating of the current degenerated token`
			`$rating_tmp = $this->_calc_probability($count, $texts_ham, $texts_spam);`

			`# Is it more important than the rating of another degenerated version?`
			`if(abs(0.5 - $rating_tmp) > abs(0.5 - $rating))`
			`$rating = $rating_tmp;`

			`}`

			`return $rating;`

			`}`

			`else {`
			`# The token is really unknown, so choose the default rating`
			`# for completely unknown tokens. This strips down to the`
			`# robX parameter so we can cheap out the freaky math ;-)`
			`return $this->config['rob_x'];`
			`}`

			`}`

			`/**`
			`* Do the actual spamminess calculation of a single token`
			`*`
			`* @access private`
			`* @param array $data`
			`* @param string $texts_ham`
			`* @param string $texts_spam`
			`* @return void`
			`*/`

			`private function _calc_probability($data, $texts_ham, $texts_spam)`
			`{`

			`# Calculate the basic probability by Mr. Graham`

			`# But: consider the number of ham and spam texts saved instead of the`
			`# number of entries where the token appeared to calculate a relative`
			`# spamminess because we count tokens appearing multiple times not just`
			`# once but as often as they appear in the learned texts`

			`$rel_ham = $data['count_ham'];`
			`$rel_spam = $data['count_spam'];`

			`if($texts_ham > 0)`
			`$rel_ham = $data['count_ham'] / $texts_ham;`

			`if($texts_spam > 0)`
			`$rel_spam = $data['count_spam'] / $texts_spam;`

			`$rating = $rel_spam / ($rel_ham + $rel_spam);`

			`# Calculate the better probability proposed by Mr. Robinson`
			`$all = $data['count_ham'] + $data['count_spam'];`
			`return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating)) / ($this->config['rob_s'] + $all);`

			`}`

			`/**`
			`* Check the validity of the category of a request`
			`*`
			`* @access private`
			`* @param string $category`
			`* @return void`
			`*/`

			`private function _check_category($category)`
			`{`
			`return $category === self::HAM or $category === self::SPAM;`
			`}`

			`/**`
			`* Learn a reference text`
			`*`
			`* @access public`
			`* @param string $text`
			`* @param const $category Either b8::SPAM or b8::HAM`
			`* @return void`
			`*/`

			`public function learn($text, $category)`
			`{`
			`return $this->_process_text($text, $category, self::LEARN);`
			`}`

			`/**`
			`* Unlearn a reference text`
			`*`
			`* @access public`
			`* @param string $text`
			`* @param const $category Either b8::SPAM or b8::HAM`
			`* @return void`
			`*/`

			`public function unlearn($text, $category)`
			`{`
			`return $this->_process_text($text, $category, self::UNLEARN);`
			`}`

			`/**`
			`* Does the actual interaction with the storage backend for learning or unlearning texts`
			`*`
			`* @access private`
			`* @param string $text`
			`* @param const $category Either b8::SPAM or b8::HAM`
			`* @param const $action Either b8::LEARN or b8::UNLEARN`
			`* @return void`
			`*/`

			`private function _process_text($text, $category, $action)`
			`{`

			`# Validate the startup`

			`$started_up = $this->validate();`

			`if($started_up !== TRUE)`
			`return $started_up;`

			`# Look if the request is okay`
			`if($this->_check_category($category) === FALSE)`
			`return self::TRAINER_CATEGORY_FAIL;`

			`# Get all tokens from $text`

			`$tokens = $this->_lexer->get_tokens($text);`

			`# Check if the lexer failed`
			`# (if so, $tokens will be a lexer error code, if not, $tokens will be an array)`
			`if(!is_array($tokens))`
			`return $tokens;`

			`# Pass the tokens and what to do with it to the storage backend`
			`return $this->_database->process_text($tokens, $category, $action);`

			`}`

			`}`

			`?>`