Community
DWF
cancel
Showing results for 
Show  only  | Search instead for 
Did you mean: 

How to extract URL from DWF file especially non-english URL?

0 REPLIES 0
Reply
Message 1 of 1
Anonymous
757 Views, 0 Replies

How to extract URLs from a DWF file, especially non-English URLs?

Hello, experts,

    I am trying to extract all the URL information from a specified DWF file. However, when processing a DWF file that contains Chinese characters, it always fails. I have pasted my code fragment below:

 

// Constructs a handler that reads its W2D data from rStream.
//
// The URL and text callbacks, the read-only file mode and the stream user
// data (this object) are all configured before open() is invoked.
// bHtmlOutput is accepted but not used by this constructor.
DwfW2DHandler::DwfW2DHandler( DWFInputStream& rStream, bool bHtmlOutput)
    : rW2DStream( rStream )
    , m_iUrlIndex( 0 )
{
    // Install the callbacks that receive URL and text objects.
    this->set_url_action( HandleURL );
    this->set_text_action( HandleText );

    // This handler only ever reads from the stream.
    this->set_file_mode( WT_File::File_Read );
    this->set_stream_user_data( this );

    this->open();
}

// Closes the file that the constructor opened.
DwfW2DHandler::~DwfW2DHandler()
{
    this->close();
}


// Stream-close hook required of the derived class.
// Performs no work and always reports success.
WT_Result DwfW2DHandler::process_stream_close(void)
{
    WT_Result result = WT_Result::Success;
    return result;
}
// Seek-to-end hook required of the derived class.
// Performs no work and always reports success.
WT_Result DwfW2DHandler::process_stream_end_seek()
{
    WT_Result result = WT_Result::Success;
    return result;
}

// Stream-open hook required of the derived class.
//
// Records how many bytes the wrapped input stream reports as available at
// open time; process_stream_tell() later subtracts the remaining count from
// this value. Always reports success.
WT_Result DwfW2DHandler::process_stream_open(void)
{
    WT_Result result = WT_Result::Success;

    nBytesAvailable = rW2DStream.available();

    return result;
}
// Stream-read hook required of the derived class.
//
// Asks the wrapped input stream for up to desired_bytes bytes, stored in
// buffer.
//   desired_bytes - number of bytes requested.
//   bytes_read    - receives the count reported by the stream, or 0 when the
//                   stream read throws.
//   buffer        - destination passed straight through to the stream.
// Returns Success, or Internal_Error if the stream read throws.
WT_Result DwfW2DHandler::process_stream_read ( int desired_bytes, int& bytes_read, void* buffer )
{
    // Give the out-parameter a defined value first, so the caller never sees
    // an indeterminate count when the read below throws.
    bytes_read = 0;

    try
    {
        bytes_read = static_cast<int>( rW2DStream.read( buffer, desired_bytes ) );
    }
    catch (...)
    {
        // Any stream exception is translated into a toolkit error code.
        return WT_Result::Internal_Error;
    }

    return WT_Result::Success;
}

// Stream-seek hook required of the derived class.
//
// Moves the wrapped input stream by distance bytes relative to its current
// position.
//   distance      - offset handed to the stream together with SEEK_CUR.
//   amount_seeked - receives the value returned by the stream's seek(), or 0
//                   when the seek throws.
// Returns Success, or Internal_Error if the stream seek throws.
WT_Result DwfW2DHandler::process_stream_seek (int distance, int& amount_seeked)
{
    // Give the out-parameter a defined value first, so the caller never sees
    // an indeterminate amount when the seek below throws.
    amount_seeked = 0;

    try
    {
        // NOTE(review): this stores whatever seek() returns. Confirm against
        // the DWFInputStream documentation that the return value really is
        // the distance moved and not, e.g., the previous stream offset.
        amount_seeked = rW2DStream.seek( SEEK_CUR, distance );
    }
    catch (...)
    {
        // Any stream exception is translated into a toolkit error code.
        return WT_Result::Internal_Error;
    }

    return WT_Result::Success;
}
// Stream-tell hook required of the derived class.
//
// Reports the current position as the number of bytes consumed so far: the
// byte count captured in process_stream_open() minus what the stream still
// reports as available.
//   current_file_pointer_position - receives the computed position.
// Returns Toolkit_Usage_Error when no output location is supplied, otherwise
// Success.
WT_Result DwfW2DHandler::process_stream_tell (unsigned long *current_file_pointer_position)
{
    // Without somewhere to store the answer there is nothing useful to do.
    if ( !current_file_pointer_position )
    {
        return WT_Result::Toolkit_Usage_Error;
    }

    // Convert directly to the destination type. Going through int first would
    // truncate (and then sign-extend) positions that do not fit in an int.
    *current_file_pointer_position =
        static_cast<unsigned long>( nBytesAvailable - rW2DStream.available() );

    return WT_Result::Success;
}

// Stream-write hook required of the derived class.
// Neither argument is used; every call is reported as a toolkit usage error.
WT_Result DwfW2DHandler::process_stream_write(int size, void const* buffer)
{
    WT_Result result = WT_Result::Toolkit_Usage_Error;
    return result;
}

// Drives the parser over the whole stream.
//
// Calls process_next_object() repeatedly for as long as it reports either
// Success or Unsupported_DWF_Opcode. The first result that is neither of
// those ends the loop and is returned to the caller. URL and text objects
// are delivered through the callbacks installed in the constructor, so the
// loop itself has nothing to do per object.
WT_Result DwfW2DHandler::Process()
{
    WT_Result result = WT_Result::Success;

    for ( ;; )
    {
        result = process_next_object();

        // Unsupported opcodes are skipped rather than treated as fatal.
        const bool bKeepGoing = ( WT_Result::Success == result ) ||
                                ( WT_Result::Unsupported_DWF_Opcode == result );
        if ( !bKeepGoing )
        {
            break;
        }
    }

    return result;
}


// URL callback installed via set_url_action().
//
// Prints the address of every URL item it can look up, starting at the
// handler's running m_iUrlIndex, and also prints the friendly name when the
// item has one.
//   rUrl  - URL object whose item list is inspected.
//   rFile - the file being parsed; it is downcast to the DwfW2DHandler that
//           owns the running index.
// Always returns Success.
//
// NOTE(review): m_iUrlIndex is never reset, so each call resumes where the
// previous one stopped. Confirm that url_item_from_index() expects such a
// file-wide index rather than a position within this particular list.
//
// NOTE(review): with the default "C" locale, wcout typically cannot convert
// non-ASCII characters (e.g. Chinese); the stream then enters a failed state
// and later output is silently dropped. Presumably the program needs to set
// a suitable locale once at start-up, e.g.
// std::locale::global( std::locale( "" ) ) - verify for the target platform.
WT_Result DwfW2DHandler::HandleURL( WT_URL& rUrl, WT_File& rFile )
{
    DwfW2DHandler& rThisHandler = static_cast<DwfW2DHandler&>( rFile );
    WT_URL_List urlList = rUrl.url();

    WT_URL_Item* pUrlItem = NULL;
    while ( ( pUrlItem = urlList.url_item_from_index( rThisHandler.m_iUrlIndex ) ) != NULL )
    {
        // Named rAddress so that it no longer shadows the rUrl parameter.
        WT_String& rAddress = pUrlItem->address();
        wchar_t* pUrlString = WT_String::to_wchar( rAddress.length(), rAddress.unicode() );

        // Streaming a null wchar_t* is undefined behaviour, so fall back to an
        // empty string if the conversion produced nothing.
        wcout << L"[Machine-readable URL=" << ( pUrlString ? pUrlString : L"" )<<L"]"<<endl;
        delete [] pUrlString;

        // The friendly name is optional; print it only when present.
        if( pUrlItem->friendly_name() )
        {
            WT_String& rFriendlyNameString = pUrlItem->friendly_name();
            wchar_t* pFriendlyName = WT_String::to_wchar( rFriendlyNameString.length(), rFriendlyNameString.unicode() );

            wcout << L"[Human-readable URL=" << ( pFriendlyName ? pFriendlyName : L"" )<<L"]"<<endl;
            delete[] pFriendlyName;
        }

        rThisHandler.m_iUrlIndex++;
    }

    return WT_Result::Success;
}

// Text callback installed via set_text_action().
//
// Converts the text object's string to a wchar_t buffer and then releases
// it again. The buffer is not printed or stored, and rFile is not used.
//   rText - text object whose string is converted.
//   rFile - the file being parsed (unused).
// Always returns Success.
WT_Result DwfW2DHandler::HandleText( WT_Text& rText, WT_File& rFile )
{
    WT_String const & rTextString = rText.string();
    wchar_t* pTextString = WT_String::to_wchar( rTextString.length(), rTextString.unicode() );

    // Use pTextString here if the text needs to be printed or stored.

    // The converted buffer belongs to the caller (HandleURL releases its
    // buffers the same way); without this every text object leaked one.
    delete [] pTextString;

    return WT_Result::Success;
}

 


According to my testing, the program simply exits when HandleText is called if the text is Chinese. I know the code is long and boring; thanks for reading and for any help.

 

best regards

0 REPLIES 0

Can't find what you're looking for? Ask the community or share your knowledge.

Post to forums  

”Boost

 

”Tips

 

”Services