-
- All Implemented Interfaces:
-
java.lang.AutoCloseable
public interface PulsarContext extends AutoCloseable
Main entry point for Pulsar functionality.
A PulsarContext can be used to inject, fetch, load, parse, and store web pages.
-
-
Method Summary
Modifier and Type | Method | Description
abstract PulsarSession | createSession() |
abstract Unit | closeSession(PulsarSession session) |
abstract Unit | registerClosable(AutoCloseable closable) | Close objects when sessions close
abstract NormUrl | normalize(String url, LoadOptions options, Boolean toItemOption) |
abstract List<NormUrl> | normalize(Iterable<String> urls, LoadOptions options, Boolean toItemOption) |
abstract NormUrl | normalize(UrlAware url, LoadOptions options, Boolean toItemOption) |
abstract List<NormUrl> | normalize(Collection<UrlAware> urls, LoadOptions options, Boolean toItemOption) |
abstract NormUrl | normalizeOrNull(String url, LoadOptions options, Boolean toItemOption) |
abstract NormUrl | normalizeOrNull(UrlAware url, LoadOptions options, Boolean toItemOption) |
abstract WebPage | inject(String url) | Inject an url
abstract WebPage | inject(NormUrl url) |
abstract WebPage | get(String url) |
abstract WebPage | getOrNull(String url) |
abstract Boolean | exists(String url) |
abstract CheckState | fetchState(WebPage page, LoadOptions options) |
abstract Iterator<WebPage> | scan(String urlPrefix) |
abstract Iterator<WebPage> | scan(String urlPrefix, Iterable<GWebPage.Field> fields) |
abstract Iterator<WebPage> | scan(String urlPrefix, Array<String> fields) |
abstract WebPage | load(String url, LoadOptions options) | Load a url with specified options, see LoadOptions for all options
abstract WebPage | load(URL url, LoadOptions options) | Load a url with specified options, see LoadOptions for all options
abstract WebPage | load(NormUrl url) | Load a url, options can be specified following the url, see LoadOptions for all options
abstract WebPage | loadDeferred(NormUrl url) |
abstract Collection<WebPage> | loadAll(Iterable<String> urls, LoadOptions options) | Load a batch of urls with the specified options.
abstract Collection<WebPage> | loadAll(Iterable<NormUrl> urls) |
abstract CompletableFuture<WebPage> | loadAsync(NormUrl url) |
abstract List<CompletableFuture<WebPage>> | loadAllAsync(Iterable<NormUrl> urls) |
abstract PulsarContext | submit(UrlAware url) |
abstract PulsarContext | submitAll(Iterable<UrlAware> urls) |
abstract FeaturedDocument | parse(WebPage page) | Parse the WebPage using Jsoup
abstract Unit | persist(WebPage page) |
abstract Unit | delete(String url) |
abstract Unit | delete(WebPage page) |
abstract Unit | flush() |
abstract Unit | await() | Wait until all tasks are done.
abstract Unit | registerShutdownHook() |
abstract Integer | getId() |
abstract PulsarEnvironment | getPulsarEnvironment() |
abstract ApplicationContext | getApplicationContext() |
abstract ImmutableConfig | getUnmodifiedConfig() |
abstract UrlPool | getCrawlPool() |
abstract CrawlLoops | getCrawlLoops() |
-
-
Method Detail
-
createSession
abstract PulsarSession createSession()
-
closeSession
abstract Unit closeSession(PulsarSession session)
-
registerClosable
abstract Unit registerClosable(AutoCloseable closable)
Close objects when sessions close
-
normalize
abstract NormUrl normalize(String url, LoadOptions options, Boolean toItemOption)
-
normalize
abstract List<NormUrl> normalize(Iterable<String> urls, LoadOptions options, Boolean toItemOption)
-
normalize
abstract NormUrl normalize(UrlAware url, LoadOptions options, Boolean toItemOption)
-
normalize
abstract List<NormUrl> normalize(Collection<UrlAware> urls, LoadOptions options, Boolean toItemOption)
-
normalizeOrNull
abstract NormUrl normalizeOrNull(String url, LoadOptions options, Boolean toItemOption)
-
normalizeOrNull
abstract NormUrl normalizeOrNull(UrlAware url, LoadOptions options, Boolean toItemOption)
-
inject
abstract WebPage inject(String url)
Inject a URL.
- Parameters:
url - The URL followed by config options
-
fetchState
abstract CheckState fetchState(WebPage page, LoadOptions options)
-
load
abstract WebPage load(String url, LoadOptions options)
Load a URL with the specified options; see LoadOptions for all options.
- Parameters:
url - The URL followed by options
options - The options
-
load
abstract WebPage load(URL url, LoadOptions options)
Load a URL with the specified options; see LoadOptions for all options.
- Parameters:
url - The URL followed by options
options - The options
-
load
abstract WebPage load(NormUrl url)
Load a URL; options can be specified following the URL. See LoadOptions for all options.
- Parameters:
url - The URL followed by options
-
loadDeferred
abstract WebPage loadDeferred(NormUrl url)
-
loadAll
abstract Collection<WebPage> loadAll(Iterable<String> urls, LoadOptions options)
Load a batch of urls with the specified options.
If the options indicate that parallel fetching is preferred, URLs are fetched in parallel whenever applicable. If the batch is too large, only a random part of the URLs is fetched immediately; the remaining URLs are put into a pending fetch list and will be fetched in the background later.
If a page exists neither in local storage nor at the given remote location, WebPage.NIL is returned.
- Parameters:
urls - The URLs to load
options - The options
-
loadAll
abstract Collection<WebPage> loadAll(Iterable<NormUrl> urls)
-
loadAsync
abstract CompletableFuture<WebPage> loadAsync(NormUrl url)
-
loadAllAsync
abstract List<CompletableFuture<WebPage>> loadAllAsync(Iterable<NormUrl> urls)
-
submit
abstract PulsarContext submit(UrlAware url)
-
submitAll
abstract PulsarContext submitAll(Iterable<UrlAware> urls)
-
parse
abstract FeaturedDocument parse(WebPage page)
Parse the WebPage using Jsoup
-
registerShutdownHook
abstract Unit registerShutdownHook()
-
getPulsarEnvironment
abstract PulsarEnvironment getPulsarEnvironment()
-
getApplicationContext
abstract ApplicationContext getApplicationContext()
-
getUnmodifiedConfig
abstract ImmutableConfig getUnmodifiedConfig()
-
getCrawlPool
abstract UrlPool getCrawlPool()
-
getCrawlLoops
abstract CrawlLoops getCrawlLoops()
-
-
-
-