1
+ <?php
2
+
3
+ namespace Swader \Diffbot \Entity ;
4
+
5
+ use Swader \Diffbot \Abstracts \Entity ;
6
+
7
class Article extends Entity
{
    /**
     * Returns the entity type reported in the response.
     * Should always return "article".
     *
     * @return string
     */
    public function getType()
    {
        return $this->objects['type'];
    }

    /**
     * Returns the URL which was crawled.
     *
     * @return string
     */
    public function getPageUrl()
    {
        return $this->objects['pageUrl'];
    }

    /**
     * Returns the page URL which was resolved by redirects, if any.
     * For example, crawling a bitly link will make this method return the ultimate destination's URL.
     * Falls back to getPageUrl() when no resolved URL is present in the response.
     *
     * @return string
     */
    public function getResolvedPageUrl()
    {
        return (isset($this->objects['resolvedPageUrl']))
            ? $this->objects['resolvedPageUrl']
            : $this->getPageUrl();
    }

    /**
     * Returns the title of the article as deduced by Diffbot.
     *
     * @return string
     */
    public function getTitle()
    {
        return $this->objects['title'];
    }

    /**
     * Returns the plaintext version of the article (no HTML) as parsed by Diffbot.
     * Only the content is returned, the text in the surrounding (layout etc.) elements is ignored.
     *
     * @return string
     */
    public function getText()
    {
        return $this->objects['text'];
    }

    /**
     * Returns the full HTML of the article's content - only the content, not the surrounding layout HTML.
     *
     * @return string
     */
    public function getHtml()
    {
        return $this->objects['html'];
    }

    /**
     * Returns date as per http://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3
     * Example date: "Wed, 18 Dec 2013 00:00:00 GMT"
     * Note that this is "strtotime" friendly for further conversions.
     *
     * @todo add more formats as method arguments
     * @return string
     */
    public function getDate()
    {
        return $this->objects['date'];
    }

    /**
     * Returns the full name of the author, as signed on the article's page.
     *
     * @return string
     */
    public function getAuthor()
    {
        return $this->objects['author'];
    }

    /**
     * The array returned will contain all tags that Diffbot's AI concluded match the content.
     *
     * Note that these are *not* the meta tags as defined by the author, but machine learned ones.
     * Note also that tags may differ depending on URL. Visiting a bitly link vs visiting a fully resolved one
     * will sometimes yield different results. It is currently unknown why this happens.
     * The format of the array is:
     *
     * [
     *     [
     *         "id": 133907,
     *         "count": 3,
     *         "prevalence": 0.3103448275862069,
     *         "label": "Apache HTTP Server",
     *         "type": "Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity",
     *         "uri": "http://dbpedia.org/resource/Apache_HTTP_Server"
     *     ],
     *     [
     *         "id": 208652,
     *         "count": 5,
     *         "prevalence": 0.5172413793103449,
     *         "label": "PHP",
     *         "type": "Http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#InformationEntity",
     *         "uri": "http://dbpedia.org/resource/PHP"
     *     ]
     * ]
     *
     * @return array
     */
    public function getTags()
    {
        return $this->objects['tags'];
    }

    /**
     * Alias for getLang()
     *
     * @see getLang()
     * @return string
     */
    public function getHumanLanguage()
    {
        return $this->getLang();
    }

    /**
     * Returns the human language as determined by Diffbot when looking at content.
     * The code returned is a two-character ISO 639-1 code: http://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
     *
     * @return string
     */
    public function getLang()
    {
        return $this->objects['humanLanguage'];
    }

    /**
     * Number of pages automatically concatenated to form the text or html response.
     * By default, Diffbot will automatically concatenate up to 20 pages of an article.
     * Returns 1 when the response carries no page count.
     *
     * @see http://support.diffbot.com/automatic-apis/handling-multiple-page-articles/
     * @return int
     */
    public function getNumPages()
    {
        return (isset($this->objects['numPages'])) ? $this->objects['numPages'] : 1;
    }

    /**
     * Array of all page URLs concatenated in a multipage article.
     * Empty array if article was not concatenated before being returned.
     *
     * @see http://support.diffbot.com/automatic-apis/handling-multiple-page-articles/
     * @return array
     */
    public function getNextPages()
    {
        return (isset($this->objects['nextPages'])) ? $this->objects['nextPages'] : [];
    }

    /**
     * Returns an array of images found in the article's content.
     *
     * Note that this (tries) to ignore content-unrelated images like ads around the page, etc.
     * An empty array is returned when the response contains no images.
     * The format of the array will be:
     *
     * [
     *     {
     *         "height": 808,
     *         "diffbotUri": "image|3|-543943368",
     *         "naturalHeight": 808,
     *         "width": 717,
     *         "primary": true,
     *         "naturalWidth": 717,
     *         "url": "https://example.com/image1.png"
     *     },
     *     {
     *         "height": 506,
     *         "diffbotUri": "image|3|-844014913",
     *         "naturalHeight": 506,
     *         "width": 715,
     *         "naturalWidth": 715,
     *         "url": "https://example.com/image1.jpeg"
     *     }
     * ]
     *
     * @return array
     */
    public function getImages()
    {
        return (isset($this->objects['images'])) ? $this->objects['images'] : [];
    }

    /**
     * Returns an array of videos found in the article's content.
     *
     * An empty array is returned when the response contains no videos.
     * The format of the array will be:
     *
     * [
     *     {
     *         "diffbotUri": "video|3|-1138675744",
     *         "primary": true,
     *         "url": "http://player.vimeo.com/video/22439234"
     *     },
     *     {
     *         "diffbotUri": "video|3|-1138675744",
     *         "primary": true,
     *         "url": "http://player.vimeo.com/video/22439234"
     *     }
     * ]
     *
     * @return array
     */
    public function getVideos()
    {
        // Read the "videos" entry; this previously read "images" by mistake,
        // which made the method return the image list instead of the videos.
        return (isset($this->objects['videos'])) ? $this->objects['videos'] : [];
    }

    /**
     * An internal identifier for Diffbot, used for indexing in their databases.
     *
     * @return string
     */
    public function getDiffbotUri()
    {
        return $this->objects['diffbotUri'];
    }

    /**
     * Placeholder accessor for the links of the article.
     *
     * @todo Not implemented yet: the body is empty, so calling this currently yields null.
     */
    public function getLinks()
    {

    }

    /**
     * Placeholder accessor for the meta information of the article.
     *
     * @todo Not implemented yet: the body is empty, so calling this currently yields null.
     */
    public function getMeta()
    {

    }

    /**
     * Placeholder accessor for the query string information of the article.
     *
     * @todo Not implemented yet: the body is empty, so calling this currently yields null.
     */
    public function getQueryString()
    {

    }
}
0 commit comments