2015-09-27 12:02:05 +00:00
< ? php
2017-12-13 07:03:42 +00:00
/**
2020-02-09 15:18:46 +00:00
* @ copyright Copyright ( C ) 2020 , Friendica
*
* @ license GNU AGPL version 3 or any later version
*
* This program is free software : you can redistribute it and / or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation , either version 3 of the
* License , or ( at your option ) any later version .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU Affero General Public License for more details .
*
* You should have received a copy of the GNU Affero General Public License
* along with this program . If not , see < https :// www . gnu . org / licenses />.
2017-12-13 07:03:42 +00:00
*
*/
2020-02-09 15:18:46 +00:00
2017-12-13 07:03:42 +00:00
namespace Friendica\Protocol ;
2018-07-20 02:15:21 +00:00
use DOMDocument ;
use DOMXPath ;
2020-07-14 14:15:04 +00:00
use Friendica\Content\PageInfo ;
2020-07-17 04:40:20 +00:00
use Friendica\Content\Text\BBCode ;
2018-07-20 02:15:21 +00:00
use Friendica\Content\Text\HTML ;
2020-07-17 04:40:20 +00:00
use Friendica\Core\Cache\Duration ;
2018-10-29 21:20:46 +00:00
use Friendica\Core\Logger ;
2018-08-11 20:40:44 +00:00
use Friendica\Core\Protocol ;
2020-11-30 06:59:00 +00:00
use Friendica\Core\Worker ;
2018-07-21 12:40:21 +00:00
use Friendica\Database\DBA ;
2019-12-15 23:47:24 +00:00
use Friendica\DI ;
2020-07-17 04:40:20 +00:00
use Friendica\Model\Contact ;
2020-11-29 09:01:51 +00:00
use Friendica\Model\Conversation ;
2018-01-20 23:52:54 +00:00
use Friendica\Model\Item ;
2020-10-31 13:26:08 +00:00
use Friendica\Model\Post ;
2020-04-17 06:35:20 +00:00
use Friendica\Model\Tag ;
2020-07-17 04:40:20 +00:00
use Friendica\Model\User ;
use Friendica\Util\DateTimeFormat ;
2018-01-27 13:25:54 +00:00
use Friendica\Util\Network ;
2020-01-03 14:09:03 +00:00
use Friendica\Util\ParseUrl ;
2020-07-17 04:40:20 +00:00
use Friendica\Util\Strings ;
2018-07-08 13:39:48 +00:00
use Friendica\Util\XML ;
2018-03-08 19:58:35 +00:00
2016-02-16 07:06:55 +00:00
/**
2020-02-09 15:18:46 +00:00
* This class contains functions to import feeds ( RSS / RDF / Atom )
2016-02-16 07:06:55 +00:00
*/
2020-06-17 08:54:44 +00:00
class Feed
{
/**
* consume - process atom feed and update anything / everything we might need to update
*
* $xml = the ( atom ) feed to consume - RSS isn ' t as fully supported but may work for simple feeds .
*
* $importer = the contact_record ( joined to user_record ) of the local user who owns this relationship .
* It is this person ' s stuff that is going to be updated .
* $contact = the person who is sending us stuff . If not set , we MAY be processing a " follow " activity
* from an external network and MAY create an appropriate contact record . Otherwise , we MUST
* have a contact record .
* $hub = should we find a hub declaration in the feed , pass it back to our calling process , who might ( or
* might not ) try and subscribe to it .
* $datedir sorts in reverse order
* $pass - by default ( $pass = 0 ) we cannot guarantee that a parent item has been
* imported prior to its children being seen in the stream unless we are certain
* of how the feed is arranged / ordered .
* With $pass = 1 , we only pull parent items out of the stream .
* With $pass = 2 , we only pull children ( comments / likes ) .
*
* So running this twice , first with pass 1 and then with pass 2 will do the right
* thing regardless of feed ordering . This won ' t be adequate in a fully - threaded
* model where comments can have sub - threads . That would require some massive sorting
* to get all the feed items into a mostly linear ordering , and might still require
* recursion .
*
* @ param $xml
* @ param array $importer
* @ param array $contact
* @ param $hub
* @ throws ImagickException
* @ throws \Friendica\Network\HTTPException\InternalServerErrorException
*/
public static function consume($xml, array $importer, array $contact, &$hub)
{
	// The contact record may be empty or partial, so read the network once
	// through the null coalescing operator instead of indexing the array
	// directly; a missing key then simply matches none of the protocols below
	// rather than raising an "undefined index" notice.
	$network = $contact['network'] ?? '';

	// Comparisons stay strict (===) so that only an exact protocol constant
	// selects a branch.
	if ($network === Protocol::OSTATUS) {
		Logger::info('Consume OStatus messages');
		// $hub is a by-reference parameter and is handed on to the OStatus
		// importer (presumably so it can report a hub found in the feed -
		// confirm against OStatus::import).
		OStatus::import($xml, $importer, $contact, $hub);
	} elseif ($network === Protocol::FEED) {
		Logger::info('Consume feeds');
		self::import($xml, $importer, $contact);
	} elseif ($network === Protocol::DFRN) {
		Logger::info('Consume DFRN messages');
		$dfrn_importer = DFRN::getImporter($contact['id'], $importer['uid']);

		// Without a DFRN importer record there is nothing to import into,
		// so the feed is silently skipped in that case.
		if (!empty($dfrn_importer)) {
			Logger::info('Now import the DFRN feed');
			DFRN::import($xml, $dfrn_importer, true, Conversation::PARCEL_LEGACY_DFRN);
		}
	}

	// Any other (or missing) network: nothing is consumed and nothing is returned.
}
2017-12-13 07:03:42 +00:00
/**
2020-01-19 06:05:23 +00:00
* Read a RSS / RDF / Atom feed and create an item entry for it
2017-12-13 07:03:42 +00:00
*
2019-01-06 21:06:53 +00:00
* @ param string $xml The feed data
* @ param array $importer The user record of the importer
* @ param array $contact The contact record of the feed
2017-12-13 07:03:42 +00:00
*
2020-01-03 14:26:28 +00:00
* @ return array Returns the header and the first item in dry run mode
2019-01-06 21:06:53 +00:00
* @ throws \Friendica\Network\HTTPException\InternalServerErrorException
2017-12-13 07:03:42 +00:00
*/
2020-01-03 14:26:28 +00:00
public static function import ( $xml , array $importer = [], array $contact = [])
2020-01-03 14:09:03 +00:00
{
2020-01-03 14:26:28 +00:00
$dryRun = empty ( $importer ) && empty ( $contact );
2017-12-13 07:03:42 +00:00
2020-01-03 14:26:28 +00:00
if ( $dryRun ) {
Logger :: info ( " Test Atom/RSS feed " );
2017-12-13 07:03:42 +00:00
} else {
2020-01-03 14:26:28 +00:00
Logger :: info ( " Import Atom/RSS feed ' " . $contact [ " name " ] . " ' (Contact " . $contact [ " id " ] . " ) for user " . $importer [ " uid " ]);
2017-12-13 07:03:42 +00:00
}
2020-01-03 14:09:03 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $xml )) {
2020-01-03 14:26:28 +00:00
Logger :: info ( 'XML is empty.' );
return [];
2017-12-13 07:03:42 +00:00
}
2015-09-27 12:02:05 +00:00
2017-12-13 07:03:42 +00:00
if ( ! empty ( $contact [ 'poll' ])) {
$basepath = $contact [ 'poll' ];
} elseif ( ! empty ( $contact [ 'url' ])) {
$basepath = $contact [ 'url' ];
} else {
$basepath = '' ;
}
2015-09-27 12:02:05 +00:00
2017-12-13 07:03:42 +00:00
$doc = new DOMDocument ();
@ $doc -> loadXML ( trim ( $xml ));
2017-12-17 20:24:57 +00:00
$xpath = new DOMXPath ( $doc );
2019-10-24 22:32:35 +00:00
$xpath -> registerNamespace ( 'atom' , ActivityNamespace :: ATOM1 );
2017-12-13 07:03:42 +00:00
$xpath -> registerNamespace ( 'dc' , " http://purl.org/dc/elements/1.1/ " );
$xpath -> registerNamespace ( 'content' , " http://purl.org/rss/1.0/modules/content/ " );
$xpath -> registerNamespace ( 'rdf' , " http://www.w3.org/1999/02/22-rdf-syntax-ns# " );
$xpath -> registerNamespace ( 'rss' , " http://purl.org/rss/1.0/ " );
$xpath -> registerNamespace ( 'media' , " http://search.yahoo.com/mrss/ " );
2019-10-24 22:32:35 +00:00
$xpath -> registerNamespace ( 'poco' , ActivityNamespace :: POCO );
2015-09-27 12:02:05 +00:00
2018-01-15 13:05:12 +00:00
$author = [];
2018-02-14 04:58:46 +00:00
$entries = null ;
2017-10-16 20:31:13 +00:00
2017-12-13 07:03:42 +00:00
// Is it RDF?
if ( $xpath -> query ( '/rdf:RDF/rss:channel' ) -> length > 0 ) {
2018-07-10 12:27:56 +00:00
$author [ " author-link " ] = XML :: getFirstNodeValue ( $xpath , '/rdf:RDF/rss:channel/rss:link/text()' );
$author [ " author-name " ] = XML :: getFirstNodeValue ( $xpath , '/rdf:RDF/rss:channel/rss:title/text()' );
2015-09-27 12:02:05 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $author [ " author-name " ])) {
$author [ " author-name " ] = XML :: getFirstNodeValue ( $xpath , '/rdf:RDF/rss:channel/rss:description/text()' );
2017-03-15 06:00:22 +00:00
}
2017-12-13 07:03:42 +00:00
$entries = $xpath -> query ( '/rdf:RDF/rss:item' );
2017-03-15 06:00:22 +00:00
}
2016-02-14 14:02:59 +00:00
2017-12-13 07:03:42 +00:00
// Is it Atom?
if ( $xpath -> query ( '/atom:feed' ) -> length > 0 ) {
2018-07-10 12:27:56 +00:00
$alternate = XML :: getFirstAttributes ( $xpath , " atom:link[@rel='alternate'] " );
2017-12-13 07:03:42 +00:00
if ( is_object ( $alternate )) {
2018-03-10 23:35:24 +00:00
foreach ( $alternate AS $attribute ) {
if ( $attribute -> name == " href " ) {
$author [ " author-link " ] = $attribute -> textContent ;
2017-03-15 06:00:22 +00:00
}
}
}
2016-02-14 14:02:59 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $author [ " author-link " ])) {
$self = XML :: getFirstAttributes ( $xpath , " atom:link[@rel='self'] " );
2017-12-13 07:03:42 +00:00
if ( is_object ( $self )) {
2018-03-10 23:35:24 +00:00
foreach ( $self AS $attribute ) {
if ( $attribute -> name == " href " ) {
$author [ " author-link " ] = $attribute -> textContent ;
2017-12-13 07:03:42 +00:00
}
}
}
}
2015-09-27 12:02:05 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $author [ " author-link " ])) {
$author [ " author-link " ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:id/text()' );
2017-12-13 07:03:42 +00:00
}
2018-07-10 12:27:56 +00:00
$author [ " author-avatar " ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:logo/text()' );
2015-09-27 12:02:05 +00:00
2018-07-10 12:27:56 +00:00
$author [ " author-name " ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:title/text()' );
2016-07-08 20:31:11 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $author [ " author-name " ])) {
$author [ " author-name " ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:subtitle/text()' );
2017-03-15 06:00:22 +00:00
}
2020-01-03 14:09:03 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $author [ " author-name " ])) {
$author [ " author-name " ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:author/atom:name/text()' );
2017-03-15 06:00:22 +00:00
}
2020-01-03 14:09:03 +00:00
2018-07-10 12:27:56 +00:00
$value = XML :: getFirstNodeValue ( $xpath , 'atom:author/poco:displayName/text()' );
2017-03-15 06:00:22 +00:00
if ( $value != " " ) {
2017-12-13 07:03:42 +00:00
$author [ " author-name " ] = $value ;
2017-03-15 06:00:22 +00:00
}
2020-01-03 14:09:03 +00:00
2020-01-03 14:26:28 +00:00
if ( $dryRun ) {
2018-11-10 13:24:10 +00:00
$author [ " author-id " ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:author/atom:id/text()' );
// See https://tools.ietf.org/html/rfc4287#section-3.2.2
$value = XML :: getFirstNodeValue ( $xpath , 'atom:author/atom:uri/text()' );
if ( $value != " " ) {
$author [ " author-link " ] = $value ;
}
2017-12-13 07:03:42 +00:00
2018-07-10 12:27:56 +00:00
$value = XML :: getFirstNodeValue ( $xpath , 'atom:author/poco:preferredUsername/text()' );
2017-12-13 07:03:42 +00:00
if ( $value != " " ) {
$author [ " author-nick " ] = $value ;
}
2020-01-03 14:09:03 +00:00
2018-07-08 11:46:05 +00:00
$value = XML :: getFirstNodeValue ( $xpath , 'atom:author/poco:address/poco:formatted/text()' );
2017-12-13 07:03:42 +00:00
if ( $value != " " ) {
$author [ " author-location " ] = $value ;
}
2020-01-03 14:09:03 +00:00
2018-07-08 11:46:05 +00:00
$value = XML :: getFirstNodeValue ( $xpath , 'atom:author/poco:note/text()' );
2017-12-13 07:03:42 +00:00
if ( $value != " " ) {
$author [ " author-about " ] = $value ;
}
2020-01-03 14:09:03 +00:00
2018-07-10 12:27:56 +00:00
$avatar = XML :: getFirstAttributes ( $xpath , " atom:author/atom:link[@rel='avatar'] " );
2017-12-13 07:03:42 +00:00
if ( is_object ( $avatar )) {
2018-03-10 23:35:24 +00:00
foreach ( $avatar AS $attribute ) {
if ( $attribute -> name == " href " ) {
$author [ " author-avatar " ] = $attribute -> textContent ;
2017-12-13 07:03:42 +00:00
}
2017-08-21 20:21:04 +00:00
}
}
}
2016-02-14 18:50:59 +00:00
2018-07-08 11:46:05 +00:00
$author [ " edited " ] = $author [ " created " ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:updated/text()' );
2015-09-27 12:02:05 +00:00
2018-07-08 11:46:05 +00:00
$author [ " app " ] = XML :: getFirstNodeValue ( $xpath , '/atom:feed/atom:generator/text()' );
2015-09-27 12:02:05 +00:00
2017-12-13 07:03:42 +00:00
$entries = $xpath -> query ( '/atom:feed/atom:entry' );
}
2015-09-27 12:02:05 +00:00
2017-12-13 07:03:42 +00:00
// Is it RSS?
if ( $xpath -> query ( '/rss/channel' ) -> length > 0 ) {
2018-07-10 12:27:56 +00:00
$author [ " author-link " ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/link/text()' );
2016-02-14 14:02:59 +00:00
2018-07-10 12:27:56 +00:00
$author [ " author-name " ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/title/text()' );
$author [ " author-avatar " ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/image/url/text()' );
2015-09-27 12:02:05 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $author [ " author-name " ])) {
$author [ " author-name " ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/copyright/text()' );
2017-12-13 07:03:42 +00:00
}
2020-01-03 14:09:03 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $author [ " author-name " ])) {
$author [ " author-name " ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/description/text()' );
2017-12-13 07:03:42 +00:00
}
2020-01-03 14:09:03 +00:00
2018-07-10 12:27:56 +00:00
$author [ " edited " ] = $author [ " created " ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/pubDate/text()' );
2015-09-27 12:02:05 +00:00
2018-07-10 12:27:56 +00:00
$author [ " app " ] = XML :: getFirstNodeValue ( $xpath , '/rss/channel/generator/text()' );
2015-09-27 12:02:05 +00:00
2017-12-13 07:03:42 +00:00
$entries = $xpath -> query ( '/rss/channel/item' );
}
2020-01-03 14:26:28 +00:00
if ( ! $dryRun ) {
2017-12-13 07:03:42 +00:00
$author [ " author-link " ] = $contact [ " url " ];
2015-09-27 12:02:05 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $author [ " author-name " ])) {
2017-12-13 07:03:42 +00:00
$author [ " author-name " ] = $contact [ " name " ];
}
2020-01-03 14:09:03 +00:00
2017-12-13 07:03:42 +00:00
$author [ " author-avatar " ] = $contact [ " thumb " ];
2015-09-27 12:02:05 +00:00
2017-12-13 07:03:42 +00:00
$author [ " owner-link " ] = $contact [ " url " ];
$author [ " owner-name " ] = $contact [ " name " ];
$author [ " owner-avatar " ] = $contact [ " thumb " ];
2017-03-15 06:00:22 +00:00
}
2015-09-27 12:02:05 +00:00
2018-01-15 13:05:12 +00:00
$header = [];
2020-01-03 14:26:28 +00:00
$header [ " uid " ] = $importer [ " uid " ] ? ? 0 ;
2018-08-11 20:40:44 +00:00
$header [ " network " ] = Protocol :: FEED ;
2017-12-13 07:03:42 +00:00
$header [ " wall " ] = 0 ;
$header [ " origin " ] = 0 ;
$header [ " gravity " ] = GRAVITY_PARENT ;
2020-03-02 07:57:23 +00:00
$header [ " private " ] = Item :: PUBLIC ;
2019-10-23 22:25:43 +00:00
$header [ " verb " ] = Activity :: POST ;
2019-10-24 22:10:20 +00:00
$header [ " object-type " ] = Activity\ObjectType :: NOTE ;
2015-09-27 12:02:05 +00:00
2020-01-03 14:26:28 +00:00
$header [ " contact-id " ] = $contact [ " id " ] ? ? 0 ;
2015-10-03 11:58:10 +00:00
2017-12-13 07:03:42 +00:00
if ( ! is_object ( $entries )) {
2020-01-03 14:26:28 +00:00
Logger :: info ( " There are no entries in this feed. " );
return [];
2017-12-13 07:03:42 +00:00
}
2016-02-14 14:02:59 +00:00
2018-01-15 13:05:12 +00:00
$items = [];
2020-08-16 17:59:37 +00:00
$creation_dates = [];
2020-03-22 13:05:35 +00:00
// Limit the number of items that are about to be fetched
$total_items = ( $entries -> length - 1 );
$max_items = DI :: config () -> get ( 'system' , 'max_feed_items' );
if (( $max_items > 0 ) && ( $total_items > $max_items )) {
$total_items = $max_items ;
}
2020-11-30 05:39:12 +00:00
$postings = [];
2018-03-10 17:40:21 +00:00
// Importing older entries first
2020-03-22 13:05:35 +00:00
for ( $i = $total_items ; $i >= 0 ; -- $i ) {
2018-03-10 17:40:21 +00:00
$entry = $entries -> item ( $i );
2015-10-14 06:10:06 +00:00
2017-12-13 07:03:42 +00:00
$item = array_merge ( $header , $author );
2018-07-10 12:27:56 +00:00
$alternate = XML :: getFirstAttributes ( $xpath , " atom:link[@rel='alternate'] " , $entry );
2017-12-13 07:03:42 +00:00
if ( ! is_object ( $alternate )) {
2018-07-10 12:27:56 +00:00
$alternate = XML :: getFirstAttributes ( $xpath , " atom:link " , $entry );
2017-12-13 07:03:42 +00:00
}
if ( is_object ( $alternate )) {
2018-03-10 23:35:24 +00:00
foreach ( $alternate AS $attribute ) {
if ( $attribute -> name == " href " ) {
$item [ " plink " ] = $attribute -> textContent ;
2017-12-13 07:03:42 +00:00
}
2017-03-15 06:00:22 +00:00
}
}
2020-01-03 14:09:03 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $item [ " plink " ])) {
2018-07-08 11:46:05 +00:00
$item [ " plink " ] = XML :: getFirstNodeValue ( $xpath , 'link/text()' , $entry );
2017-12-13 07:03:42 +00:00
}
2020-01-03 14:09:03 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $item [ " plink " ])) {
2018-07-08 11:46:05 +00:00
$item [ " plink " ] = XML :: getFirstNodeValue ( $xpath , 'rss:link/text()' , $entry );
2017-12-13 07:03:42 +00:00
}
2015-09-27 12:02:05 +00:00
2018-07-08 11:46:05 +00:00
$item [ " uri " ] = XML :: getFirstNodeValue ( $xpath , 'atom:id/text()' , $entry );
2015-09-27 12:02:05 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $item [ " uri " ])) {
2018-07-08 11:46:05 +00:00
$item [ " uri " ] = XML :: getFirstNodeValue ( $xpath , 'guid/text()' , $entry );
2017-12-13 07:03:42 +00:00
}
2020-01-03 14:09:03 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $item [ " uri " ])) {
2017-12-13 07:03:42 +00:00
$item [ " uri " ] = $item [ " plink " ];
}
2017-04-08 08:12:14 +00:00
2020-07-12 12:45:34 +00:00
// Add the base path if missing
$item [ " uri " ] = Network :: addBasePath ( $item [ " uri " ], $basepath );
$item [ " plink " ] = Network :: addBasePath ( $item [ " plink " ], $basepath );
2017-12-13 07:03:42 +00:00
$orig_plink = $item [ " plink " ];
2017-04-08 08:12:14 +00:00
2020-03-04 21:33:31 +00:00
$item [ " plink " ] = DI :: httpRequest () -> finalUrl ( $item [ " plink " ]);
2017-04-08 08:12:14 +00:00
2018-07-08 11:46:05 +00:00
$item [ " title " ] = XML :: getFirstNodeValue ( $xpath , 'atom:title/text()' , $entry );
2017-01-31 19:39:09 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $item [ " title " ])) {
2018-07-08 11:46:05 +00:00
$item [ " title " ] = XML :: getFirstNodeValue ( $xpath , 'title/text()' , $entry );
2017-12-13 07:03:42 +00:00
}
2018-07-10 12:27:56 +00:00
if ( empty ( $item [ " title " ])) {
2018-07-08 11:46:05 +00:00
$item [ " title " ] = XML :: getFirstNodeValue ( $xpath , 'rss:title/text()' , $entry );
2017-12-13 07:03:42 +00:00
}
2019-08-18 13:37:31 +00:00
2019-08-22 10:00:21 +00:00
$item [ " title " ] = html_entity_decode ( $item [ " title " ], ENT_QUOTES , 'UTF-8' );
2019-08-18 13:37:31 +00:00
2018-07-08 11:46:05 +00:00
$published = XML :: getFirstNodeValue ( $xpath , 'atom:published/text()' , $entry );
2015-09-27 12:02:05 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $published )) {
2018-07-08 11:46:05 +00:00
$published = XML :: getFirstNodeValue ( $xpath , 'pubDate/text()' , $entry );
2017-12-13 07:03:42 +00:00
}
2020-01-03 14:09:03 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $published )) {
2018-07-08 11:46:05 +00:00
$published = XML :: getFirstNodeValue ( $xpath , 'dc:date/text()' , $entry );
2017-12-13 07:03:42 +00:00
}
2020-01-03 14:09:03 +00:00
2018-07-08 11:46:05 +00:00
$updated = XML :: getFirstNodeValue ( $xpath , 'atom:updated/text()' , $entry );
2015-09-27 12:02:05 +00:00
2018-08-29 19:11:43 +00:00
if ( empty ( $updated ) && ! empty ( $published )) {
2017-12-13 07:03:42 +00:00
$updated = $published ;
}
2018-08-29 19:11:43 +00:00
if ( empty ( $published ) && ! empty ( $updated )) {
$published = $updated ;
}
2017-12-13 07:03:42 +00:00
if ( $published != " " ) {
$item [ " created " ] = $published ;
}
2020-01-03 14:09:03 +00:00
2017-12-13 07:03:42 +00:00
if ( $updated != " " ) {
$item [ " edited " ] = $updated ;
}
2020-01-03 14:09:03 +00:00
2020-08-16 17:59:37 +00:00
if ( ! $dryRun ) {
$condition = [ " `uid` = ? AND `uri` = ? AND `network` IN (?, ?) " ,
$importer [ " uid " ], $item [ " uri " ], Protocol :: FEED , Protocol :: DFRN ];
$previous = Item :: selectFirst ([ 'id' , 'created' ], $condition );
if ( DBA :: isResult ( $previous )) {
// Use the creation date when the post had been stored. It can happen this date changes in the feed.
$creation_dates [] = $previous [ 'created' ];
Logger :: info ( " Item with uri " . $item [ " uri " ] . " for user " . $importer [ " uid " ] . " already existed under id " . $previous [ " id " ]);
continue ;
}
$creation_dates [] = DateTimeFormat :: utc ( $item [ 'created' ]);
}
2018-07-08 11:46:05 +00:00
$creator = XML :: getFirstNodeValue ( $xpath , 'author/text()' , $entry );
2015-09-27 12:02:05 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $creator )) {
2018-07-08 11:46:05 +00:00
$creator = XML :: getFirstNodeValue ( $xpath , 'atom:author/atom:name/text()' , $entry );
2017-12-13 07:03:42 +00:00
}
2020-01-03 14:09:03 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $creator )) {
2018-07-08 11:46:05 +00:00
$creator = XML :: getFirstNodeValue ( $xpath , 'dc:creator/text()' , $entry );
2017-12-13 07:03:42 +00:00
}
2020-01-03 14:09:03 +00:00
2017-12-13 07:03:42 +00:00
if ( $creator != " " ) {
$item [ " author-name " ] = $creator ;
}
2020-01-03 14:09:03 +00:00
2018-07-08 11:46:05 +00:00
$creator = XML :: getFirstNodeValue ( $xpath , 'dc:creator/text()' , $entry );
2015-09-27 12:02:05 +00:00
2017-12-13 07:03:42 +00:00
if ( $creator != " " ) {
$item [ " author-name " ] = $creator ;
}
2017-04-04 17:47:45 +00:00
2017-12-13 07:03:42 +00:00
/// @TODO ?
// <category>Ausland</category>
// <media:thumbnail width="152" height="76" url="http://www.taz.de/picture/667875/192/14388767.jpg"/>
2018-01-15 13:05:12 +00:00
$attachments = [];
2017-12-13 07:03:42 +00:00
2018-03-10 17:40:21 +00:00
$enclosures = $xpath -> query ( " enclosure|atom:link[@rel='enclosure'] " , $entry );
2017-12-13 07:03:42 +00:00
foreach ( $enclosures AS $enclosure ) {
$href = " " ;
2020-11-07 08:22:59 +00:00
$length = null ;
$type = null ;
2017-12-13 07:03:42 +00:00
2018-03-10 23:35:24 +00:00
foreach ( $enclosure -> attributes AS $attribute ) {
if ( in_array ( $attribute -> name , [ " url " , " href " ])) {
$href = $attribute -> textContent ;
} elseif ( $attribute -> name == " length " ) {
2020-11-04 06:58:04 +00:00
$length = ( int ) $attribute -> textContent ;
2018-03-10 23:35:24 +00:00
} elseif ( $attribute -> name == " type " ) {
$type = $attribute -> textContent ;
2017-12-13 07:03:42 +00:00
}
}
2020-01-03 14:09:03 +00:00
2020-11-07 08:22:59 +00:00
if ( ! empty ( $href )) {
$attachments [] = [ 'type' => Post\Media :: DOCUMENT , 'url' => $href , 'mimetype' => $type , 'size' => $length ];
2017-03-15 06:00:22 +00:00
}
2017-12-13 07:03:42 +00:00
}
2015-09-27 12:02:05 +00:00
2020-04-15 05:10:40 +00:00
$taglist = [];
2017-12-13 07:03:42 +00:00
$categories = $xpath -> query ( " category " , $entry );
foreach ( $categories AS $category ) {
2020-05-02 05:08:05 +00:00
$taglist [] = $category -> nodeValue ;
2017-12-13 07:03:42 +00:00
}
2017-12-12 05:35:41 +00:00
2018-07-10 12:27:56 +00:00
$body = trim ( XML :: getFirstNodeValue ( $xpath , 'atom:content/text()' , $entry ));
2017-10-17 09:10:19 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $body )) {
$body = trim ( XML :: getFirstNodeValue ( $xpath , 'content:encoded/text()' , $entry ));
2017-12-13 07:03:42 +00:00
}
2019-03-16 10:59:11 +00:00
$summary = trim ( XML :: getFirstNodeValue ( $xpath , 'atom:summary/text()' , $entry ));
if ( empty ( $summary )) {
$summary = trim ( XML :: getFirstNodeValue ( $xpath , 'description/text()' , $entry ));
2017-12-13 07:03:42 +00:00
}
2019-03-16 10:59:11 +00:00
2018-07-10 12:27:56 +00:00
if ( empty ( $body )) {
2019-03-16 10:59:11 +00:00
$body = $summary ;
$summary = '' ;
}
if ( $body == $summary ) {
$summary = '' ;
2017-12-13 07:03:42 +00:00
}
2017-10-17 09:10:19 +00:00
2017-12-13 07:03:42 +00:00
// remove the content of the title if it is identical to the body
// This helps with auto generated titles e.g. from tumblr
2018-01-20 23:52:54 +00:00
if ( self :: titleIsBody ( $item [ " title " ], $body )) {
2017-12-13 07:03:42 +00:00
$item [ " title " ] = " " ;
}
2018-03-08 19:58:35 +00:00
$item [ " body " ] = HTML :: toBBCode ( $body , $basepath );
2017-10-17 09:10:19 +00:00
2020-09-18 15:25:48 +00:00
// Remove tracking pixels
$item [ " body " ] = preg_replace ( " / \ [img=1x1 \ ]([^ \ [ \ ]]*) \ [ \ /img \ ]/Usi " , '' , $item [ " body " ]);
2017-12-13 07:03:42 +00:00
if (( $item [ " body " ] == '' ) && ( $item [ " title " ] != '' )) {
$item [ " body " ] = $item [ " title " ];
$item [ " title " ] = '' ;
}
2017-10-17 09:10:19 +00:00
2020-08-15 20:05:08 +00:00
if ( $dryRun ) {
$items [] = $item ;
break ;
} elseif ( ! Item :: isValid ( $item )) {
2020-11-30 20:32:56 +00:00
Logger :: info ( 'Feed item is invalid' , [ 'created' => $item [ 'created' ], 'uid' => $item [ 'uid' ], 'uri' => $item [ 'uri' ]]);
continue ;
2020-11-30 20:59:18 +00:00
} elseif ( Item :: isTooOld ( $item )) {
2020-11-30 20:32:56 +00:00
Logger :: info ( 'Feed is too old' , [ 'created' => $item [ 'created' ], 'uid' => $item [ 'uid' ], 'uri' => $item [ 'uri' ]]);
2020-08-15 20:05:08 +00:00
continue ;
}
2018-02-14 04:58:46 +00:00
$preview = '' ;
2017-12-13 07:03:42 +00:00
if ( ! empty ( $contact [ " fetch_further_information " ]) && ( $contact [ " fetch_further_information " ] < 3 )) {
// Handle enclosures and treat them as preview picture
foreach ( $attachments AS $attachment ) {
2020-11-07 08:22:59 +00:00
if ( $attachment [ " mimetype " ] == " image/jpeg " ) {
$preview = $attachment [ " url " ];
2017-12-13 07:03:42 +00:00
}
2017-03-15 06:00:22 +00:00
}
2015-09-27 12:02:05 +00:00
2017-12-13 07:03:42 +00:00
// Remove a possible link to the item itself
$item [ " body " ] = str_replace ( $item [ " plink " ], '' , $item [ " body " ]);
2019-11-18 12:29:27 +00:00
$item [ " body " ] = trim ( preg_replace ( '/\[url\=\](\w+.*?)\[\/url\]/i' , '' , $item [ " body " ]));
2017-10-17 11:39:09 +00:00
2017-12-13 07:03:42 +00:00
// Replace the content when the title is longer than the body
$replace = ( strlen ( $item [ " title " ]) > strlen ( $item [ " body " ]));
2017-10-17 09:58:29 +00:00
2017-12-13 07:03:42 +00:00
// Replace it, when there is an image in the body
if ( strstr ( $item [ " body " ], '[/img]' )) {
$replace = true ;
}
2017-10-17 09:58:29 +00:00
2017-12-13 07:03:42 +00:00
// Replace it, when there is a link in the body
if ( strstr ( $item [ " body " ], '[/url]' )) {
$replace = true ;
}
2017-10-17 09:58:29 +00:00
2020-09-17 10:36:33 +00:00
$saved_body = $item [ " body " ];
$saved_title = $item [ " title " ];
2017-12-13 07:03:42 +00:00
if ( $replace ) {
2019-11-18 12:29:27 +00:00
$item [ " body " ] = trim ( $item [ " title " ]);
2017-12-13 07:03:42 +00:00
}
2019-11-18 12:29:27 +00:00
2020-01-03 14:09:03 +00:00
$data = ParseUrl :: getSiteinfoCached ( $item [ 'plink' ], true );
2019-11-18 12:29:27 +00:00
if ( ! empty ( $data [ 'text' ]) && ! empty ( $data [ 'title' ]) && ( mb_strlen ( $item [ 'body' ]) < mb_strlen ( $data [ 'text' ]))) {
// When the fetched page info text is longer than the body, we do try to enhance the body
2019-11-18 18:09:21 +00:00
if ( ! empty ( $item [ 'body' ]) && ( strpos ( $data [ 'title' ], $item [ 'body' ]) === false ) && ( strpos ( $data [ 'text' ], $item [ 'body' ]) === false )) {
2019-11-18 12:29:27 +00:00
// The body is not part of the fetched page info title or page info text. So we add the text to the body
$item [ 'body' ] .= " \n \n " . $data [ 'text' ];
} else {
// Else we replace the body with the page info text
$item [ 'body' ] = $data [ 'text' ];
}
}
2020-09-17 10:36:33 +00:00
$data = PageInfo :: queryUrl ( $item [ " plink " ], false , $preview , ( $contact [ " fetch_further_information " ] == 2 ), $contact [ " ffi_keyword_denylist " ] ? ? '' );
2020-11-12 05:17:48 +00:00
if ( ! empty ( $data )) {
// Take the data that was provided by the feed if the query is empty
if (( $data [ 'type' ] == 'link' ) && empty ( $data [ 'title' ]) && empty ( $data [ 'text' ])) {
$data [ 'title' ] = $saved_title ;
$item [ " body " ] = $saved_body ;
}
2020-09-17 10:36:33 +00:00
2020-11-12 05:17:48 +00:00
$data_text = strip_tags ( trim ( $data [ 'text' ] ? ? '' ));
$item_body = strip_tags ( trim ( $item [ 'body' ] ? ? '' ));
2020-09-17 13:07:20 +00:00
2020-11-12 05:17:48 +00:00
if ( ! empty ( $data_text ) && (( $data_text == $item_body ) || strstr ( $item_body , $data_text ))) {
$data [ 'text' ] = '' ;
}
2020-09-17 13:07:20 +00:00
2020-11-12 05:17:48 +00:00
// We always strip the title since it will be added in the page information
$item [ " title " ] = " " ;
$item [ " body " ] = $item [ " body " ] . " \n " . PageInfo :: getFooterFromData ( $data , false );
$taglist = $contact [ " fetch_further_information " ] == 2 ? PageInfo :: getTagsFromUrl ( $item [ " plink " ], $preview , $contact [ " ffi_keyword_denylist " ] ? ? '' ) : [];
$item [ " object-type " ] = Activity\ObjectType :: BOOKMARK ;
$attachments = [];
}
2017-12-13 07:03:42 +00:00
} else {
2019-03-16 10:59:11 +00:00
if ( ! empty ( $summary )) {
2019-03-16 11:18:36 +00:00
$item [ " body " ] = '[abstract]' . HTML :: toBBCode ( $summary , $basepath ) . " [/abstract] \n " . $item [ " body " ];
2019-03-16 10:59:11 +00:00
}
2020-01-05 02:19:02 +00:00
if ( ! empty ( $contact [ " fetch_further_information " ]) && ( $contact [ " fetch_further_information " ] == 3 )) {
2020-05-02 05:08:05 +00:00
if ( empty ( $taglist )) {
2020-07-16 06:13:47 +00:00
$taglist = PageInfo :: getTagsFromUrl ( $item [ " plink " ], $preview , $contact [ " ffi_keyword_denylist " ] ? ? '' );
2017-12-13 07:03:42 +00:00
}
2020-05-02 05:43:00 +00:00
$item [ " body " ] .= " \n " . self :: tagToString ( $taglist );
2020-04-15 05:10:40 +00:00
} else {
$taglist = [];
2017-12-13 07:03:42 +00:00
}
2020-01-03 14:09:03 +00:00
2018-01-12 05:55:14 +00:00
// Add the link to the original feed entry if not present in feed
2018-01-29 06:03:39 +00:00
if (( $item [ 'plink' ] != '' ) && ! strstr ( $item [ " body " ], $item [ 'plink' ])) {
2020-01-03 14:09:03 +00:00
$item [ " body " ] .= " [hr][url] " . $item [ 'plink' ] . " [/url] " ;
2017-12-12 05:35:41 +00:00
}
2017-08-27 06:59:07 +00:00
}
2015-09-27 12:02:05 +00:00
2020-08-15 20:05:08 +00:00
Logger :: info ( 'Stored feed' , [ 'item' => $item ]);
2015-09-27 12:02:05 +00:00
2020-08-15 20:05:08 +00:00
$notify = Item :: isRemoteSelf ( $contact , $item );
2016-11-14 06:55:17 +00:00
2020-08-15 20:05:08 +00:00
// Distributed items should have a well formatted URI.
// Additionally we have to avoid conflicts with identical URI between imported feeds and these items.
if ( $notify ) {
$item [ 'guid' ] = Item :: guidFromUri ( $orig_plink , DI :: baseUrl () -> getHostname ());
2020-12-01 22:11:29 +00:00
$item [ 'uri' ] = Item :: newURI ( $item [ 'uid' ], $item [ 'guid' ]);
2020-11-11 07:47:48 +00:00
unset ( $item [ 'thr-parent' ]);
2020-08-15 20:05:08 +00:00
unset ( $item [ 'parent-uri' ]);
2018-05-15 19:29:14 +00:00
2020-08-15 20:05:08 +00:00
// Set the delivery priority for "remote self" to "medium"
$notify = PRIORITY_MEDIUM ;
}
2020-04-15 11:39:00 +00:00
2020-12-01 22:11:29 +00:00
if ( ! Post\Delayed :: exists ( $item [ " uri " ])) {
$postings [] = [ 'item' => $item , 'notify' => $notify ,
'taglist' => $taglist , 'attachments' => $attachments ];
} else {
Logger :: info ( 'Post already exists in the delayed posts queue' , [ 'uri' => $item [ " uri " ]]);
}
2020-11-30 05:39:12 +00:00
}
2015-09-27 12:02:05 +00:00
2020-11-30 05:39:12 +00:00
if ( ! empty ( $postings )) {
2020-12-01 22:11:29 +00:00
$min_posting = DI :: config () -> get ( 'system' , 'minimum_posting_interval' , 0 );
2020-11-30 05:39:12 +00:00
$total = count ( $postings );
if ( $total > 1 ) {
2020-11-30 06:19:10 +00:00
// Posts shouldn't be delayed more than a day
2020-11-30 08:59:29 +00:00
$interval = min ( 1440 , self :: getPollInterval ( $contact ));
2020-12-01 22:11:29 +00:00
$delay = max ( round (( $interval * 60 ) / $total ), 60 * $min_posting );
2020-11-30 06:06:43 +00:00
Logger :: notice ( 'Got posting delay' , [ 'delay' => $delay , 'interval' => $interval , 'items' => $total , 'cid' => $contact [ 'id' ], 'url' => $contact [ 'url' ]]);
2020-11-30 05:39:12 +00:00
} else {
$delay = 0 ;
}
$post_delay = 0 ;
2020-04-15 05:10:40 +00:00
2020-11-30 05:39:12 +00:00
foreach ( $postings as $posting ) {
if ( $delay > 0 ) {
2020-12-01 22:11:29 +00:00
$publish_time = time () + $post_delay ;
Logger :: notice ( 'Got publishing date' , [ 'delay' => $delay , 'cid' => $contact [ 'id' ], 'url' => $contact [ 'url' ]]);
2020-11-30 05:39:12 +00:00
$post_delay += $delay ;
2020-11-30 18:27:02 +00:00
} else {
2020-12-01 22:11:29 +00:00
$publish_time = time ();
}
$last_publish = DI :: pConfig () -> get ( $posting [ 'item' ][ 'uid' ], 'system' , 'last_publish' , 0 , true );
$next_publish = max ( $last_publish + ( 60 * $min_posting ), time ());
if ( $publish_time < $next_publish ) {
2020-12-01 23:46:50 +00:00
Logger :: notice ( 'Adapting publish time' ,
[ 'last' => date ( DateTimeFormat :: MYSQL , $last_publish ),
'next' => date ( DateTimeFormat :: MYSQL , $next_publish ),
'publish' => date ( DateTimeFormat :: MYSQL , $publish_time )]);
2020-12-01 22:11:29 +00:00
$publish_time = $next_publish ;
2020-04-15 05:10:40 +00:00
}
2020-12-01 22:11:29 +00:00
$publish_at = date ( DateTimeFormat :: MYSQL , $publish_time );
2020-11-30 05:39:12 +00:00
2020-12-02 00:05:03 +00:00
Post\Delayed :: add ( $publish_at , $posting [ 'item' ], $posting [ 'notify' ], $posting [ 'taglist' ], $posting [ 'attachments' ]);
2020-12-01 22:11:29 +00:00
DI :: pConfig () -> set ( $item [ 'uid' ], 'system' , 'last_publish' , $next_publish );
2017-12-13 07:03:42 +00:00
}
2017-03-15 06:00:22 +00:00
}
2017-12-13 07:03:42 +00:00
2020-08-16 21:38:26 +00:00
if ( ! $dryRun && DI :: config () -> get ( 'system' , 'adjust_poll_frequency' )) {
2020-08-16 17:59:37 +00:00
self :: adjustPollFrequency ( $contact , $creation_dates );
}
2020-01-03 14:26:28 +00:00
return [ " header " => $author , " items " => $items ];
2015-09-27 12:02:05 +00:00
}
2018-01-20 23:52:54 +00:00
2020-08-16 17:59:37 +00:00
/**
 * Automatically adjust the poll frequency according to the post frequency
 *
 * Only contacts on the feed network are handled; every other network is
 * logged and skipped. The calculated value is written to the "rating" field
 * of the contact record when it differs from the stored one.
 *
 * Priority values written by this method:
 *   1 = every quarter hour, 2 = every half hour, 3 = hourly,
 *   4 = every two hours, 5 = every three hours, 6 = every six hours,
 *   7 = twice a day, 8 = daily, 9 = weekly, 10 = monthly
 *
 * @param array $contact        Contact record, reads 'id', 'uid', 'url', 'network' and 'rating'
 * @param array $creation_dates Creation dates of the feed entries, in any format strtotime() understands
 * @return void
 */
private static function adjustPollFrequency(array $contact, array $creation_dates)
{
	// Shared logging context; the array union below keeps the original key order
	$context = ['id' => $contact['id'], 'uid' => $contact['uid'], 'url' => $contact['url']];

	if ($contact['network'] != Protocol::FEED) {
		Logger::info('Contact is no feed, skip.', $context + ['network' => $contact['network']]);
		return;
	}

	if (empty($creation_dates)) {
		Logger::info('No posts, switching to daily polling', $context);
		$priority = 8; // Poll once a day
	} else {
		// Current time expressed in (fractional) days since the Unix epoch
		$today = time() / 86400;

		// Count the post frequency per day and remember the latest post date
		$frequency   = [];
		$newest      = 0;
		$newest_date = '';

		foreach ($creation_dates as $date) {
			$timestamp = strtotime($date);
			if ($timestamp === false) {
				// Unparsable dates carry no usable information for the statistics
				continue;
			}

			$day           = intdiv($timestamp, 86400);
			$second_of_day = $timestamp % 86400;

			// Only have a look at values from the last seven days
			if (($today - $day) < 7) {
				if (!isset($frequency[$day])) {
					$frequency[$day] = ['count' => 1, 'low' => $second_of_day, 'high' => $second_of_day];
				} else {
					++$frequency[$day]['count'];
					$frequency[$day]['low']  = min($frequency[$day]['low'], $second_of_day);
					$frequency[$day]['high'] = max($frequency[$day]['high'], $second_of_day);
				}
			}

			if ($newest < $day) {
				$newest      = $day;
				$newest_date = $date;
			}
		}

		$newest_context = ['newest' => $newest_date] + $context;
		$days_silent    = $today - $newest;

		if (count($creation_dates) == 1) {
			Logger::info('Feed had posted a single time, switching to daily polling', $newest_context);
			$priority = 8; // Poll once a day
		} elseif ($days_silent > 730) {
			Logger::info('Feed had not posted for two years, switching to monthly polling', $newest_context);
			$priority = 10; // Poll every month
		} elseif ($days_silent > 365) {
			Logger::info('Feed had not posted for a year, switching to weekly polling', $newest_context);
			$priority = 9; // Poll every week
		} elseif (empty($frequency)) {
			Logger::info('Feed had not posted for at least a week, switching to daily polling', $newest_context);
			$priority = 8; // Poll once a day
		} else {
			// Calculate the highest "posts per day" value
			$max = 0;
			foreach ($frequency as $entry) {
				if (($entry['count'] == 1) || ($entry['high'] == $entry['low'])) {
					continue;
				}

				// We take the earliest and latest post of the day and interpolate the number of posts per day
				// that would have been created with this post frequency.
				// Assume at least four hours between oldest and newest post per day - should be okay for news outlets
				$duration = max($entry['high'] - $entry['low'], 14400);
				$ppd      = (86400 / $duration) * $entry['count'];
				if ($ppd > $max) {
					$max = $ppd;
				}
			}

			if ($max > 48) {
				$priority = 1; // Poll every quarter hour
			} elseif ($max > 24) {
				$priority = 2; // Poll half an hour
			} elseif ($max > 12) {
				$priority = 3; // Poll hourly
			} elseif ($max > 8) {
				$priority = 4; // Poll every two hours
			} elseif ($max > 4) {
				$priority = 5; // Poll every three hours
			} elseif ($max > 2) {
				$priority = 6; // Poll every six hours
			} else {
				$priority = 7; // Poll twice a day
			}

			Logger::info('Calculated priority by the posts per day', ['priority' => $priority, 'max' => round($max, 2)] + $context);
		}
	}

	if ($contact['rating'] != $priority) {
		Logger::notice('Adjusting priority', ['old' => $contact['rating'], 'new' => $priority] + $context);
		DBA::update('contact', ['rating' => $priority], ['id' => $contact['id']]);
	}
}
2020-11-30 06:19:10 +00:00
/**
 * Get the poll interval for the given contact array
 *
 * The contact is mapped to a rating (an index into the interval table below):
 * - mail and feed contacts use the stored 'rating' when the
 *   'system.adjust_poll_frequency' setting is enabled and the contact is a feed,
 *   otherwise their 'priority' setting is translated via a lookup table
 * - DFRN and OStatus contacts are checked once a day
 * - archived contacts and Zot/phantom contacts are checked once a month
 * - every other network is checked once a week
 *
 * @param array $contact Contact record, reads 'network' and 'archive' plus 'rating' or 'priority'
 * @return int Poll interval in minutes, 0 when the rating is negative or not part of the interval table
 */
public static function getPollInterval(array $contact)
{
	$network = $contact['network'];

	if (in_array($network, [Protocol::MAIL, Protocol::FEED])) {
		// Translation of the contact's "priority" setting into an interval table index
		$ratings = [0, 3, 7, 8, 9, 10];
		if (DI::config()->get('system', 'adjust_poll_frequency') && ($network == Protocol::FEED)) {
			$rating = $contact['rating'];
		} elseif (array_key_exists($contact['priority'], $ratings)) {
			$rating = $ratings[$contact['priority']];
		} else {
			$rating = -1;
		}
	} else {
		// Check once a week per default for all other networks
		$rating = 9;
	}

	// Friendica and OStatus are checked once a day
	if (in_array($network, [Protocol::DFRN, Protocol::OSTATUS])) {
		$rating = 8;
	}

	// Check archived contacts or contacts with unsupported protocols once a month
	if ($contact['archive'] || in_array($network, [Protocol::ZOT, Protocol::PHANTOM])) {
		$rating = 10;
	}

	if ($rating < 0) {
		return 0;
	}

	// Interval in minutes per rating: configured minimum, 15 minutes, 30 minutes, 1 hour, 2 hours,
	// 3 hours, 6 hours, 12 hours, 1 day, 1 week, 1 month
	$min_poll_interval = max(1, DI::config()->get('system', 'min_poll_interval'));
	$poll_intervals = [$min_poll_interval, 15, 30, 60, 120, 180, 360, 720, 1440, 10080, 43200];

	// A stored rating outside of the table must not raise an "undefined offset" notice
	// or leak null to callers that expect an integer.
	return $poll_intervals[$rating] ?? 0;
}
2020-05-02 05:43:00 +00:00
/**
 * Convert a tag array to a tag string
 *
 * Every tag becomes a BBCode link to the local tag search page
 * (base URL + "/search?tag=" + URL-encoded tag) with the tag itself as the
 * link text. The entries are concatenated with a separator between them.
 *
 * @param array $tags List of tag names
 * @return string tag string, empty when $tags is empty
 */
private static function tagToString ( array $tags )
{
$tagstr = '' ;
foreach ( $tags as $tag ) {
// Add the separator in front of every entry but the first.
// NOTE(review): the double-quoted literals in this method carry padding blanks that
// look like an extraction artifact - confirm them against the canonical source.
if ( $tagstr != " " ) {
$tagstr .= " , " ;
}
2020-11-30 06:06:43 +00:00
2020-05-02 05:43:00 +00:00
// The tag is URL-encoded for the query string but used verbatim as the link text
$tagstr .= " #[url= " . DI :: baseUrl () . " /search?tag= " . urlencode ( $tag ) . " ] " . $tag . " [/url] " ;
}
return $tagstr ;
}
2018-01-20 23:52:54 +00:00
/**
 * Check whether the title of an entry merely repeats the beginning of its body
 *
 * Both values are normalised the same way (tags stripped, trimmed, HTML
 * entities decoded, whitespace substitution via str_replace()). The body is
 * then cut to the length of the title before both are compared.
 *
 * @param string $title Title of the entry
 * @param string $body  Body of the entry
 * @return bool true when the normalised title equals the normalised start of the body
 */
private static function titleIsBody ( $title , $body )
{
// Normalise the title
$title = strip_tags ( $title );
$title = trim ( $title );
$title = html_entity_decode ( $title , ENT_QUOTES , 'UTF-8' );
// NOTE(review): the double-quoted literals in this method carry padding blanks that
// look like an extraction artifact - confirm them against the canonical source.
$title = str_replace ([ " \n " , " \r " , " \t " , " " ], [ " " , " " , " " , " " ], $title );
// Normalise the body in exactly the same way
$body = strip_tags ( $body );
$body = trim ( $body );
$body = html_entity_decode ( $body , ENT_QUOTES , 'UTF-8' );
$body = str_replace ([ " \n " , " \r " , " \t " , " " ], [ " " , " " , " " , " " ], $body );
// Only the part of the body that could match the title is of interest
if ( strlen ( $title ) < strlen ( $body )) {
$body = substr ( $body , 0 , strlen ( $title ));
}
// A title ending in an ellipsis is treated as a shortened body:
// compare only the part in front of the last ellipsis.
if (( $title != $body ) && ( substr ( $title , - 3 ) == " ... " )) {
$pos = strrpos ( $title , " ... " );
if ( $pos > 0 ) {
$title = substr ( $title , 0 , $pos );
$body = substr ( $body , 0 , $pos );
}
}
// NOTE(review): loose comparison - two numeric-looking strings may compare equal
// although they differ textually; confirm whether === is intended.
return ( $title == $body );
}
2020-07-17 04:40:20 +00:00
2020-07-17 04:46:42 +00:00
/**
 * Creates the Atom feed for a given nickname
 *
 * Supported filters:
 * - activity (default): all the public posts
 * - posts: all the public top-level posts
 * - comments: all the public replies
 *
 * The generated feed is stored in the cache together with the creation date
 * of the newest item. $last_update is received by value, so the caller's
 * variable is not modified by this method; the advanced date only ends up in
 * the cached payload.
 *
 * @param string  $owner_nick  Nickname of the feed owner
 * @param string  $last_update Date of the last update; only items received after it are returned
 * @param integer $max_items   Number of maximum items to fetch
 * @param string  $filter      Feed items filter (activity, posts or comments)
 * @param boolean $nocache     Whether to bypass reading from the cache (the result is still written to it)
 *
 * @return string|null Atom feed, nothing when the owner is unknown
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 * @throws \ImagickException
 */
public static function atom ( $owner_nick , $last_update , $max_items = 300 , $filter = 'activity' , $nocache = false )
{
// Start time, only used for the duration values in the log entries
$stamp = microtime ( true );
$owner = User :: getOwnerDataByNick ( $owner_nick );
if ( ! $owner ) {
return ;
}
$cachekey = " feed:feed: " . $owner_nick . " : " . $filter . " : " . $last_update ;
// Keep the requested date for logging, $last_update is advanced further below
$previous_created = $last_update ;
// Don't cache when the last item was posted less then 15 minutes ago (Cache duration)
// NOTE(review): the condition reads the cache exactly when the last item is younger
// than 15 minutes, which contradicts the comment above - confirm the intended direction.
if (( time () - strtotime ( $owner [ 'last-item' ])) < 15 * 60 ) {
$result = DI :: cache () -> get ( $cachekey );
if ( ! $nocache && ! is_null ( $result )) {
Logger :: info ( 'Cached feed duration' , [ 'seconds' => number_format ( microtime ( true ) - $stamp , 3 ), 'nick' => $owner_nick , 'filter' => $filter , 'created' => $previous_created ]);
return $result [ 'feed' ];
}
}
$check_date = empty ( $last_update ) ? '' : DateTimeFormat :: utc ( $last_update );
2020-08-07 13:49:59 +00:00
$authorid = Contact :: getIdForURL ( $owner [ " url " ]);
2020-07-17 04:40:20 +00:00
// Base condition: non-deleted, visible, non-private wall items (posts and comments)
// of the owner that arrived after the check date and belong to one of the listed networks
$condition = [ " `uid` = ? AND `received` > ? AND NOT `deleted` AND `gravity` IN (?, ?)
AND `private` != ? AND `visible` AND `wall` AND `parent-network` IN ( ? , ? , ? , ? ) " ,
$owner [ " uid " ], $check_date , GRAVITY_PARENT , GRAVITY_COMMENT ,
Item :: PRIVATE , Protocol :: ACTIVITYPUB ,
Protocol :: OSTATUS , Protocol :: DFRN , Protocol :: DIASPORA ];
// The "comments" filter is limited to items with the comment object type
if ( $filter === 'comments' ) {
$condition [ 0 ] .= " AND `object-type` = ? " ;
$condition [] = Activity\ObjectType :: COMMENT ;
}
// Accounts other than community accounts only publish items authored by the owner
if ( $owner [ 'account-type' ] != User :: ACCOUNT_TYPE_COMMUNITY ) {
$condition [ 0 ] .= " AND `contact-id` = ? AND `author-id` = ? " ;
$condition [] = $owner [ " id " ];
$condition [] = $authorid ;
}
$params = [ 'order' => [ 'received' => true ], 'limit' => $max_items ];
// The "posts" filter queries threads, the other filters query single items
if ( $filter === 'posts' ) {
$ret = Item :: selectThread ([], $condition , $params );
} else {
$ret = Item :: select ([], $condition , $params );
}
$items = Item :: inArray ( $ret );
$doc = new DOMDocument ( '1.0' , 'utf-8' );
$doc -> formatOutput = true ;
$root = self :: addHeader ( $doc , $owner , $filter );
foreach ( $items as $item ) {
$entry = self :: entry ( $doc , $item , $owner );
$root -> appendChild ( $entry );
// Track the newest creation date among the returned items
if ( $last_update < $item [ 'created' ]) {
$last_update = $item [ 'created' ];
}
}
$feeddata = trim ( $doc -> saveXML ());
// The result is written to the cache regardless of $nocache
$msg = [ 'feed' => $feeddata , 'last_update' => $last_update ];
DI :: cache () -> set ( $cachekey , $msg , Duration :: QUARTER_HOUR );
Logger :: info ( 'Feed duration' , [ 'seconds' => number_format ( microtime ( true ) - $stamp , 3 ), 'nick' => $owner_nick , 'filter' => $filter , 'created' => $previous_created ]);
return $feeddata ;
}
/**
 * Adds the header elements to the XML document
 *
 * Creates the Atom "feed" root element, appends it to the document and fills
 * it with the generator, id, title, subtitle, logo, updated, author and link
 * elements. The title depends on the filter; a filter other than "activity",
 * "posts" or "comments" leaves the title empty. The "self" link carries the
 * filter name for "activity" and "comments" but not for "posts".
 *
 * @param DOMDocument $doc    XML document
 * @param array       $owner  Contact data of the poster, reads 'nick', 'name', 'photo' and 'url'
 * @param string      $filter The related feed filter (activity, posts or comments)
 *
 * @return object header root element
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 */
private static function addHeader ( DOMDocument $doc , array $owner , $filter )
{
$root = $doc -> createElementNS ( ActivityNamespace :: ATOM1 , 'feed' );
$doc -> appendChild ( $root );
$title = '' ;
$selfUri = '/feed/' . $owner [ " nick " ] . '/' ;
// Pick the localised title and complete the path of the "self" link
switch ( $filter ) {
case 'activity' :
$title = DI :: l10n () -> t ( '%s\'s timeline' , $owner [ 'name' ]);
$selfUri .= $filter ;
break ;
case 'posts' :
$title = DI :: l10n () -> t ( '%s\'s posts' , $owner [ 'name' ]);
break ;
case 'comments' :
$title = DI :: l10n () -> t ( '%s\'s comments' , $owner [ 'name' ]);
$selfUri .= $filter ;
break ;
}
$attributes = [ " uri " => " https://friendi.ca " , " version " => FRIENDICA_VERSION . " - " . DB_UPDATE_VERSION ];
XML :: addElement ( $doc , $root , " generator " , FRIENDICA_PLATFORM , $attributes );
XML :: addElement ( $doc , $root , " id " , DI :: baseUrl () . " /profile/ " . $owner [ " nick " ]);
XML :: addElement ( $doc , $root , " title " , $title );
// Unlike the title, the subtitle is built with sprintf() and not passed through the translation layer
XML :: addElement ( $doc , $root , " subtitle " , sprintf ( " Updates from %s on %s " , $owner [ " name " ], DI :: config () -> get ( 'config' , 'sitename' )));
XML :: addElement ( $doc , $root , " logo " , $owner [ " photo " ]);
XML :: addElement ( $doc , $root , " updated " , DateTimeFormat :: utcNow ( DateTimeFormat :: ATOM ));
$author = self :: addAuthor ( $doc , $owner );
$root -> appendChild ( $author );
// Link to the HTML profile of the owner
$attributes = [ " href " => $owner [ " url " ], " rel " => " alternate " , " type " => " text/html " ];
XML :: addElement ( $doc , $root , " link " , " " , $attributes );
OStatus :: hublinks ( $doc , $root , $owner [ " nick " ]);
// Link to this feed itself
$attributes = [ " href " => DI :: baseUrl () . $selfUri , " rel " => " self " , " type " => " application/atom+xml " ];
XML :: addElement ( $doc , $root , " link " , " " , $attributes );
return $root ;
}
/**
 * Adds the author element to the XML document
 *
 * The element is created but not attached to the document tree; the caller
 * appends it where needed. It contains the owner's URL as "uri", the nickname
 * as "name" and the address as "email".
 *
 * @param DOMDocument $doc   XML document
 * @param array       $owner Contact data of the poster, reads 'url', 'nick' and 'addr'
 *
 * @return \DOMElement author element
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 */
private static function addAuthor ( DOMDocument $doc , array $owner )
{
$author = $doc -> createElement ( " author " );
XML :: addElement ( $doc , $author , " uri " , $owner [ " url " ]);
XML :: addElement ( $doc , $author , " name " , $owner [ " nick " ]);
XML :: addElement ( $doc , $author , " email " , $owner [ " addr " ]);
return $author ;
}
/**
 * Adds an entry element to the XML document
 *
 * Items for which OStatus::getResharedGuid() reports a guid are rendered as
 * reshare entries. When that is not possible (reshareEntry() returned a falsy
 * value) or the item is no reshare, a regular note entry is created.
 *
 * @param DOMDocument $doc   XML document
 * @param array       $item  Data of the item that is to be posted
 * @param array       $owner Contact data of the poster
 *
 * @return \DOMElement Entry element
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 * @throws \ImagickException
 */
private static function entry ( DOMDocument $doc , array $item , array $owner )
{
$xml = null ;
$repeated_guid = OStatus :: getResharedGuid ( $item );
if ( $repeated_guid != " " ) {
$xml = self :: reshareEntry ( $doc , $item , $owner , $repeated_guid );
}
// Use the reshare entry when it could be built ...
if ( $xml ) {
return $xml ;
}
// ... and fall back to a regular entry otherwise
return self :: noteEntry ( $doc , $item , $owner );
}
/**
 * Adds an entry element with reshared content
 *
 * The reshared item is looked up by guid among the owner's public or unlisted
 * items of the federated networks. Its title is used for the entry, while the
 * content and footer are generated from the resharing item itself.
 *
 * @param DOMDocument $doc           XML document
 * @param array       $item          Data of the item that is to be posted
 * @param array       $owner         Contact data of the poster
 * @param string      $repeated_guid guid of the reshared item
 *
 * @return \DOMElement|false Entry element as created by OStatus::entryHeader(), false when the reshared item wasn't found
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 * @throws \ImagickException
 */
private static function reshareEntry ( DOMDocument $doc , array $item , array $owner , $repeated_guid )
{
// A comment from a different author than the feed owner is only logged, the entry is created nevertheless
if (( $item [ 'gravity' ] != GRAVITY_PARENT ) && ( Strings :: normaliseLink ( $item [ " author-link " ]) != Strings :: normaliseLink ( $owner [ " url " ]))) {
Logger :: info ( 'Feed entry author does not match feed owner' , [ 'owner' => $owner [ " url " ], 'author' => $item [ " author-link " ]]);
}
$entry = OStatus :: entryHeader ( $doc , $owner , $item , false );
$condition = [ 'uid' => $owner [ " uid " ], 'guid' => $repeated_guid , 'private' => [ Item :: PUBLIC , Item :: UNLISTED ],
'network' => Protocol :: FEDERATED ];
$repeated_item = Item :: selectFirst ([], $condition );
if ( ! DBA :: isResult ( $repeated_item )) {
return false ;
}
2020-07-17 05:27:45 +00:00
// Title of the reshared item, body of the resharing item, marked with the "share" verb
self :: entryContent ( $doc , $entry , $item , self :: getTitle ( $repeated_item ), Activity :: SHARE , false );
2020-07-17 04:40:20 +00:00
self :: entryFooter ( $doc , $entry , $item , $owner );
return $entry ;
}
/**
 * Adds a regular entry element
 *
 * Builds the entry from the header created by OStatus::entryHeader(), the
 * content elements (with a title fetched or generated by getTitle()) and the
 * footer elements.
 *
 * @param DOMDocument $doc   XML document
 * @param array       $item  Data of the item that is to be posted
 * @param array       $owner Contact data of the poster
 *
 * @return \DOMElement Entry element
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 * @throws \ImagickException
 */
private static function noteEntry ( DOMDocument $doc , array $item , array $owner )
{
// A comment from a different author than the feed owner is only logged, the entry is created nevertheless
if (( $item [ 'gravity' ] != GRAVITY_PARENT ) && ( Strings :: normaliseLink ( $item [ " author-link " ]) != Strings :: normaliseLink ( $owner [ " url " ]))) {
Logger :: info ( 'Feed entry author does not match feed owner' , [ 'owner' => $owner [ " url " ], 'author' => $item [ " author-link " ]]);
}
$entry = OStatus :: entryHeader ( $doc , $owner , $item , false );
2020-07-17 05:27:45 +00:00
// An empty verb lets entryContent() derive it from the item
self :: entryContent ( $doc , $entry , $item , self :: getTitle ( $item ), '' , true );
2020-07-17 04:40:20 +00:00
self :: entryFooter ( $doc , $entry , $item , $owner );
return $entry ;
}
/**
 * Adds the content elements of an entry to the XML document
 *
 * Appends the "id", "title", "content" (HTML converted from the BBCode body),
 * an "alternate" link to the local display page of the item and the
 * "published" and "updated" dates.
 *
 * @param DOMDocument $doc      XML document
 * @param \DOMElement $entry    Entry element where the content is added
 * @param array       $item     Data of the item that is to be posted, reads 'uri', 'body', 'guid', 'created' and 'edited'
 * @param string      $title    Title for the post
 * @param string      $verb     The activity verb; derived from the item when empty. The value isn't written to the entry by this method.
 * @param bool        $complete Not referenced in the method body
 * @return void
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 */
private static function entryContent ( DOMDocument $doc , \DOMElement $entry , array $item , $title , $verb = " " , $complete = true )
{
// NOTE(review): $verb is calculated here but not used afterwards - confirm whether it can be dropped
if ( $verb == " " ) {
$verb = OStatus :: constructVerb ( $item );
}
XML :: addElement ( $doc , $entry , " id " , $item [ " uri " ]);
XML :: addElement ( $doc , $entry , " title " , html_entity_decode ( $title , ENT_QUOTES , 'UTF-8' ));
// Convert the BBCode body to HTML for the feed
$body = OStatus :: formatPicturePost ( $item [ 'body' ]);
$body = BBCode :: convert ( $body , false , BBCode :: OSTATUS );
XML :: addElement ( $doc , $entry , " content " , $body , [ " type " => " html " ]);
XML :: addElement ( $doc , $entry , " link " , " " , [ " rel " => " alternate " , " type " => " text/html " ,
" href " => DI :: baseUrl () . " /display/ " . $item [ " guid " ]]
);
// The stored dates get an explicit UTC offset appended before they are converted to the Atom date format
XML :: addElement ( $doc , $entry , " published " , DateTimeFormat :: utc ( $item [ " created " ] . " +00:00 " , DateTimeFormat :: ATOM ));
XML :: addElement ( $doc , $entry , " updated " , DateTimeFormat :: utc ( $item [ " edited " ] . " +00:00 " , DateTimeFormat :: ATOM ));
}
/**
 * Adds the elements at the foot of an entry to the XML document
 *
 * For comments a "thr:in-reply-to" element and a "related" link pointing to
 * the parent are added. Independent of that, every hashtag of the item is
 * added as a "category" element and the attachments are appended via
 * OStatus::getAttachment().
 *
 * @param DOMDocument $doc   XML document
 * @param object      $entry The entry element where the elements are added
 * @param array       $item  Data of the item that is to be posted
 * @param array       $owner Contact data of the poster
 * @return void
 * @throws \Friendica\Network\HTTPException\InternalServerErrorException
 */
private static function entryFooter ( DOMDocument $doc , $entry , array $item , array $owner )
{
// NOTE(review): $mentioned is filled below but never read in this method - confirm whether it is still needed
$mentioned = [];
if ( $item [ 'gravity' ] != GRAVITY_PARENT ) {
$parent = Item :: selectFirst ([ 'guid' , 'author-link' , 'owner-link' ], [ 'id' => $item [ 'parent' ]]);
2020-11-11 07:47:48 +00:00
$thrparent = Item :: selectFirst ([ 'guid' , 'author-link' , 'owner-link' , 'plink' ], [ 'uid' => $owner [ " uid " ], 'uri' => $item [ 'thr-parent' ]]);
2020-07-17 04:40:20 +00:00
// Prefer the direct thread parent, fall back to the top level parent
if ( DBA :: isResult ( $thrparent )) {
$mentioned [ $thrparent [ " author-link " ]] = $thrparent [ " author-link " ];
$mentioned [ $thrparent [ " owner-link " ]] = $thrparent [ " owner-link " ];
$parent_plink = $thrparent [ " plink " ];
} else {
// NOTE(review): $parent is used without a DBA::isResult() check - confirm that the parent always exists here
$mentioned [ $parent [ " author-link " ]] = $parent [ " author-link " ];
$mentioned [ $parent [ " owner-link " ]] = $parent [ " owner-link " ];
$parent_plink = DI :: baseUrl () . " /display/ " . $parent [ " guid " ];
}
$attributes = [
2020-11-11 07:47:48 +00:00
" ref " => $item [ 'thr-parent' ],
2020-07-17 04:40:20 +00:00
" href " => $parent_plink ];
XML :: addElement ( $doc , $entry , " thr:in-reply-to " , " " , $attributes );
$attributes = [
" rel " => " related " ,
" href " => $parent_plink ];
XML :: addElement ( $doc , $entry , " link " , " " , $attributes );
}
// uri-id isn't present for follow entry pseudo-items
$tags = Tag :: getByURIId ( $item [ 'uri-id' ] ? ? 0 );
foreach ( $tags as $tag ) {
$mentioned [ $tag [ 'url' ]] = $tag [ 'url' ];
}
// Only hashtags are exported as categories
foreach ( $tags as $tag ) {
if ( $tag [ 'type' ] == Tag :: HASHTAG ) {
XML :: addElement ( $doc , $entry , " category " , " " , [ " term " => $tag [ 'name' ]]);
}
}
OStatus :: getAttachment ( $doc , $entry , $item );
}
2020-07-17 05:27:45 +00:00
/**
 * Fetch or create title for feed entry
 *
 * The sources are tried in this order:
 * 1. the title of the item, converted from BBCode
 * 2. the title of the data attached to the body (BBCode::getAttachedData())
 * 3. the first line of the body as plain text, limited to 100 bytes and
 *    followed by a trailer when it had to be cut or the first line is empty
 *
 * @param array $item Item data, reads 'title' and 'body'
 * @return string title
 */
private static function getTitle ( array $item )
{
if ( $item [ 'title' ] != '' ) {
return BBCode :: convert ( $item [ 'title' ], false , BBCode :: OSTATUS );
}
// Fetch information about the post
$siteinfo = BBCode :: getAttachedData ( $item [ " body " ]);
if ( isset ( $siteinfo [ " title " ])) {
return $siteinfo [ " title " ];
}
// If no bookmark is found then take the first line
// Remove the share element before fetching the first line
$title = trim ( preg_replace ( " / \ [share.*? \ ](.*?) \ [ \ /share \ ]/ism " , " \n $ 1 \n " , $item [ 'body' ]));
// A line break is appended so that strpos() below always finds one
$title = HTML :: toPlaintext ( BBCode :: convert ( $title , false ), 0 , true ) . " \n " ;
$pos = strpos ( $title , " \n " );
$trailer = " " ;
// Cut after 100 bytes when the first line is empty or longer than that
// NOTE(review): substr() counts bytes, so a multibyte character may be split at the cut - confirm whether mb_substr() is wanted
if (( $pos == 0 ) || ( $pos > 100 )) {
$pos = 100 ;
$trailer = " ... " ;
}
return substr ( $title , 0 , $pos ) . $trailer ;
}
2015-09-27 12:02:05 +00:00
}